MATH 422: Time Series Analysis

Fall 2022

Read data

# Read monthly hate-crime counts (columns: year, month, hate_crime).
# NOTE(review): a hard-coded absolute setwd() is fragile — prefer a
# project-relative path (e.g. the here package) so this runs elsewhere.
setwd('/Users/mandyhong/Desktop/MATH 422')
allHC   <- read.csv('all_race_hc_ts.csv')
blackHC <- read.csv('black_hc_ts.csv')
asianHC <- read.csv('asian_hc_ts.csv')

0. Descriptive Analysis

Get time series plots for all three data and summary statistics

# Quick look at each raw monthly series (astsa::tsplot)
tsplot(allHC$hate_crime)

tsplot(blackHC$hate_crime)

tsplot(asianHC$hate_crime)

# Overlay the three series on a common y-range for visual comparison.
series_cols <- c(rgb(.85, .30, .12, .6), rgb(.12, .65, .85, .6), "aquamarine3")
y_range <- c(min(asianHC$hate_crime), max(allHC$hate_crime))
tsplot(allHC$hate_crime, col = series_cols[1], lwd = 2, pch = 20,
       ylim = y_range, ylab = "Hate crimes", main = "Racial Hate Crimes")
lines(blackHC$hate_crime, col = series_cols[2], lwd = 2, pch = 20)
lines(asianHC$hate_crime, col = series_cols[3], lwd = 2, pch = 20)
legend("topleft", col = series_cols, lty = 1, lwd = 2, pch = 20,
       legend = c("All race", "African American/Black", "Asian"), bg = "white")

# Same comparison on the log scale (headroom of +1 added above the max).
series_cols <- c(rgb(.85, .30, .12, .6), rgb(.12, .65, .85, .6), "aquamarine3")
log_range <- c(min(log(asianHC$hate_crime)), max(log(allHC$hate_crime) + 1))
tsplot(log(allHC$hate_crime), col = series_cols[1], lwd = 2, pch = 20,
       ylim = log_range, ylab = "log(Hate crimes)", main = "Racial Hate Crimes")
lines(log(blackHC$hate_crime), col = series_cols[2], lwd = 2, pch = 20)
lines(log(asianHC$hate_crime), col = series_cols[3], lwd = 2, pch = 20)
legend("topleft", col = series_cols, lty = 1, lwd = 2, pch = 20,
       legend = c("All race", "African American/Black", "Asian"), bg = "white")

# Summary statistics: all-race series (Jan 1991 - Dec 2020, monthly)
summary(allHC)
##       year          month         hate_crime    
##  Min.   :1991   Min.   : 1.00   Min.   : 198.0  
##  1st Qu.:1998   1st Qu.: 3.75   1st Qu.: 319.0  
##  Median :2006   Median : 6.50   Median : 379.0  
##  Mean   :2006   Mean   : 6.50   Mean   : 389.1  
##  3rd Qu.:2013   3rd Qu.: 9.25   3rd Qu.: 448.0  
##  Max.   :2020   Max.   :12.00   Max.   :1329.0
# Summary statistics: African American/Black series
summary(blackHC)
##       year          month         hate_crime   
##  Min.   :1991   Min.   : 1.00   Min.   : 88.0  
##  1st Qu.:1998   1st Qu.: 3.75   1st Qu.:167.0  
##  Median :2006   Median : 6.50   Median :202.0  
##  Mean   :2006   Mean   : 6.50   Mean   :207.7  
##  3rd Qu.:2013   3rd Qu.: 9.25   3rd Qu.:242.5  
##  Max.   :2020   Max.   :12.00   Max.   :693.0
# Summary statistics: Asian series
summary(asianHC)
##       year          month         hate_crime   
##  Min.   :1991   Min.   : 1.00   Min.   : 3.00  
##  1st Qu.:1998   1st Qu.: 3.75   1st Qu.:12.00  
##  Median :2006   Median : 6.50   Median :17.00  
##  Mean   :2006   Mean   : 6.50   Mean   :17.84  
##  3rd Qu.:2013   3rd Qu.: 9.25   3rd Qu.:23.00  
##  Max.   :2020   Max.   :12.00   Max.   :51.00

Check outliers and describe them

# First difference of the monthly counts, zero-padded so the column lines up
# with the original rows.
allHC$difference <- c(0, diff(allHC$hate_crime))
# IQR fences: compute BOTH the quartiles and the IQR on the unpadded
# differences (the original mixed padded quartiles with an unpadded IQR,
# letting the artificial leading 0 distort the fences).
d_all <- diff(allHC$hate_crime)
iqr <- IQR(d_all)
Q <- quantile(d_all, probs = c(.25, .75), na.rm = FALSE)
high <- Q[2] + 1.5 * iqr  # upper fence
low  <- Q[1] - 1.5 * iqr  # lower fence
tsplot(allHC$difference, main = "Detecting Outliers Using IQR Score")
abline(h = high, lty = 2, col = 'red')  # h= is the idiomatic horizontal line
abline(h = low,  lty = 2, col = 'red')

# Same IQR-fence outlier screen for the Black series; quartiles and IQR both
# taken from the unpadded differences for consistency.
blackHC$difference <- c(0, diff(blackHC$hate_crime))
d_black <- diff(blackHC$hate_crime)
iqr <- IQR(d_black)
Q <- quantile(d_black, probs = c(.25, .75), na.rm = FALSE)
#Qtest<-quantile(blackHC$difference)
#Qtest[4] #3rd quantile
high <- Q[2] + 1.5 * iqr
low  <- Q[1] - 1.5 * iqr
tsplot(blackHC$difference, main = "Detecting Outliers Using IQR Score")
abline(h = high, lty = 2, col = 'red')
abline(h = low,  lty = 2, col = 'red')

#blackHC[blackHC$difference > high, ]
#blackHC[blackHC$difference < low, ]
#outliers=c(which(blackHC$difference > high), which(blackHC$difference < low)) #12 outliers
#blackHC_no_outliers=blackHC[-outliers,]
#tsplot(blackHC_no_outliers$difference) #getting rid of outliers look stationary. In this case can we exclude outliers?
#tsplot(diff(blackHC_no_outliers$hate_crime)) #why do this plot and the plot above looks different?

# Same IQR-fence outlier screen for the Asian series.
asianHC$difference <- c(0, diff(asianHC$hate_crime))
d_asian <- diff(asianHC$hate_crime)
iqr <- IQR(d_asian)
Q <- quantile(d_asian, probs = c(.25, .75), na.rm = FALSE)
high <- Q[2] + 1.5 * iqr
low  <- Q[1] - 1.5 * iqr
tsplot(asianHC$difference, main = "Detecting Outliers Using IQR Score")
abline(h = high, lty = 2, col = 'red')
abline(h = low,  lty = 2, col = 'red')

#asianHC[asianHC$difference > high, ]
#asianHC[asianHC$difference < low, ]
#outliers=c(which(asianHC$difference > high), which(asianHC$difference < low)) #8 outliers
#asianHC_no_outliers=asianHC[-outliers,]

1. All racial hate crimes

Get time series plots for all racial hate crimes

# Two time-indexed versions of the all-race series:
#  ts1 — POSIXct index built from "year-month-01" strings
#  ts2 — zoo::yearmon index (year + fraction of year), nicer for monthly data
ts1 <- xts(allHC$hate_crime, as.POSIXct(sprintf("%d-%d-01", allHC$year, allHC$month)))
ts2 <- xts(allHC$hate_crime, as.yearmon(allHC$year + (allHC$month-1)/12))

#plot(ts1)
plot(ts2, main="All Racial Hate Crimes")

#balckHC=as.ts(blackHC)
#asianHC=as.ts(asianHC)

Fit function of time model

# Start with a plain linear time trend.
# NOTE: `t` shadows base::t() for the rest of the script; the name is kept
# because later models (mod2HC, mod3HC, mod5HC, Xcos/Xsin) reuse it.
t <- seq_len(nrow(allHC))  # one index per month (replaces hard-coded 1:360)
mod1HC <- lm(hate_crime ~ t, data = allHC)
summary(mod1HC) #Adjusted R-squared:  0.06522
## 
## Call:
## lm(formula = hate_crime ~ t, data = allHC)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -241.68  -63.07   -7.12   50.68  925.42 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 439.96051   11.50834  38.230  < 2e-16 ***
## t            -0.28199    0.05525  -5.104 5.42e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 109 on 358 degrees of freedom
## Multiple R-squared:  0.06782,    Adjusted R-squared:  0.06522 
## F-statistic: 26.05 on 1 and 358 DF,  p-value: 5.424e-07
#checking stationarity of the residuals
tsplot(mod1HC$residuals) # residuals look much like the original series; trend removal didn't help

plot(mod1HC$residuals) # doesn't look stationary — try a quadratic trend next
lines(lowess(mod1HC$residuals))

#quadratic model: add a t^2 term to the linear trend
mod2HC=lm(hate_crime~t+I(t^2), data=allHC)
summary(mod2HC) #Adjusted R-squared:  0.06488 — no improvement over the linear fit
## 
## Call:
## lm(formula = hate_crime ~ t + I(t^2), data = allHC)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -229.80  -61.02   -8.74   48.63  920.90 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.279e+02  1.733e+01  24.696   <2e-16 ***
## t           -8.180e-02  2.216e-01  -0.369    0.712    
## I(t^2)      -5.545e-04  5.946e-04  -0.933    0.352    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 109 on 357 degrees of freedom
## Multiple R-squared:  0.07009,    Adjusted R-squared:  0.06488 
## F-statistic: 13.45 on 2 and 357 DF,  p-value: 2.329e-06
tsplot(mod2HC$residuals) # residuals barely changed from the original series

plot(mod2HC$residuals) # still non-stationary; clear seasonal pattern remains
lines(lowess(mod2HC$residuals))

#cosine model: annual harmonic regressors with period 12 (monthly data)
allHC$Xcos=cos(2*pi*t/12)
allHC$Xsin=sin(2*pi*t/12)

mod3HC=lm(hate_crime~t+I(t^2)+Xcos+Xsin,data=allHC)
summary(mod3HC) #Adjusted R-squared: 0.2178 — improved, but t and t^2 are not significant
## 
## Call:
## lm(formula = hate_crime ~ t + I(t^2) + Xcos + Xsin, data = allHC)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -221.22  -58.64   -7.05   36.36  888.85 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.285e+02  1.585e+01  27.039  < 2e-16 ***
## t           -8.650e-02  2.027e-01  -0.427    0.670    
## I(t^2)      -5.499e-04  5.438e-04  -1.011    0.313    
## Xcos        -5.424e+01  7.429e+00  -7.301 1.89e-12 ***
## Xsin        -3.193e+01  7.431e+00  -4.297 2.24e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 99.67 on 355 degrees of freedom
## Multiple R-squared:  0.2265, Adjusted R-squared:  0.2178 
## F-statistic: 25.98 on 4 and 355 DF,  p-value: < 2.2e-16
tsplot(mod3HC$residuals) # seasonal pattern removed, but still not stationary

plot(mod3HC$residuals)
lines(lowess(mod3HC$residuals)) # better than the two models above

#cosine model, dropping the insignificant polynomial trend terms
mod4HC=lm(hate_crime~Xcos+Xsin,data=allHC)
summary(mod4HC) #Adjusted R-squared: 0.1503 — worse than mod3HC
## 
## Call:
## lm(formula = hate_crime ~ Xcos + Xsin, data = allHC)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -182.59  -71.94   -9.86   49.94  909.07 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  389.061      5.475  71.066  < 2e-16 ***
## Xcos         -54.528      7.742  -7.043 9.74e-12 ***
## Xsin         -30.868      7.742  -3.987 8.12e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 103.9 on 357 degrees of freedom
## Multiple R-squared:  0.155,  Adjusted R-squared:  0.1503 
## F-statistic: 32.75 on 2 and 357 DF,  p-value: 8.745e-14
tsplot(mod4HC$residuals) # seasonality removed, but still not stationary

plot(mod4HC$residuals)
lines(lowess(mod4HC$residuals)) # better than the first two models

#seasonal-means model: month as a factor; +0 drops the intercept so each month
#gets its own mean level
mod5HC=lm(hate_crime~t+as.factor(month)+0, data=allHC)
summary(mod5HC) #Adjusted R-squared: 0.9417, best so far
# NOTE(review): R^2 under the no-intercept parameterisation is not comparable
# to the intercept models above — compare via AIC/BIC instead.
## 
## Call:
## lm(formula = hate_crime ~ t + as.factor(month) + 0, data = allHC)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -221.73  -53.75   -2.05   37.68  866.72 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## t                   -0.28353    0.04964  -5.712  2.4e-08 ***
## as.factor(month)1  376.21794   19.86072  18.943  < 2e-16 ***
## as.factor(month)2  377.16814   19.88248  18.970  < 2e-16 ***
## as.factor(month)3  440.58500   19.90435  22.135  < 2e-16 ***
## as.factor(month)4  446.93520   19.92631  22.429  < 2e-16 ***
## as.factor(month)5  481.78540   19.94837  24.152  < 2e-16 ***
## as.factor(month)6  472.26893   19.97053  23.648  < 2e-16 ***
## as.factor(month)7  485.21913   19.99279  24.270  < 2e-16 ***
## as.factor(month)8  476.20266   20.01514  23.792  < 2e-16 ***
## as.factor(month)9  498.85286   20.03760  24.896  < 2e-16 ***
## as.factor(month)10 484.46972   20.06015  24.151  < 2e-16 ***
## as.factor(month)11 402.81992   20.08280  20.058  < 2e-16 ***
## as.factor(month)12 340.33678   20.10555  16.928  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 97.82 on 347 degrees of freedom
## Multiple R-squared:  0.9438, Adjusted R-squared:  0.9417 
## F-statistic:   448 on 13 and 347 DF,  p-value: < 2.2e-16
tsplot(mod5HC$residuals) # seasonal pattern removed, but still not stationary

plot(mod5HC$residuals)
lines(lowess(mod5HC$residuals)) # better than the first two models, still not flat

AIC(mod5HC) #4336.28
## [1] 4336.28
BIC(mod5HC) #4390.685
## [1] 4390.685
acf(mod5HC$residuals) # strong autocorrelation remains — residuals are not white noise

None of the models fixed the issue of non-linearity observed in the residuals, so we cannot use function-of-time models for prediction.

Check stationarity of the data and transform data for SARIMA model fitting

tsplot(allHC$hate_crime)

acf(allHC$hate_crime, lag=100) # slow seasonal decay suggests seasonal differencing

tsplot(diff(allHC$hate_crime, lag=12))

acf(diff(allHC$hate_crime, lag=12)) # seasonal pattern is gone

tsplot(diff(allHC$hate_crime))

acf(diff(allHC$hate_crime), lag=100) # definitely need seasonal differencing

tsplot(log(allHC$hate_crime))  # log transform did not fix the non-constant variance much

acf(log(allHC$hate_crime)) # still has seasonal structure

tsplot(diff(log(allHC$hate_crime), lag=12))

acf(diff(log(allHC$hate_crime), lag=12))

tsplot(diff(log(allHC$hate_crime)))

acf(diff(log(allHC$hate_crime)), lag=100) # seasonal structure remains

#Seasonal differencing makes sense. Still need to confirm stationarity formally.
# ADF: H0 = unit root; small p-value is evidence of stationarity
adf.test(diff(allHC$hate_crime, lag=12))
## Warning in adf.test(diff(allHC$hate_crime, lag = 12)): p-value smaller than
## printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff(allHC$hate_crime, lag = 12)
## Dickey-Fuller = -4.7544, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
# Phillips-Perron: H0 = unit root; small p-value supports stationarity
pp.test(diff(allHC$hate_crime, lag=12))
## Warning in pp.test(diff(allHC$hate_crime, lag = 12)): p-value smaller than
## printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  diff(allHC$hate_crime, lag = 12)
## Dickey-Fuller Z(alpha) = -190.63, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
# KPSS: H0 = level stationarity (note the reversed null vs ADF/PP)
kpss.test(diff(allHC$hate_crime, lag=12))
## Warning in kpss.test(diff(allHC$hate_crime, lag = 12)): p-value greater than
## printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  diff(allHC$hate_crime, lag = 12)
## KPSS Level = 0.23177, Truncation lag parameter = 5, p-value = 0.1
# ERS DF-GLS unit-root test; series passes all four tests, proceed with SARIMA
summary(ur.ers(diff(allHC$hate_crime, lag=12))) #pass all, stationary. We can proceed with fitting SARIMA model
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -841.93  -45.46  -12.03   17.39  802.40 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -0.07861    0.03157  -2.490  0.01325 *  
## yd.diff.lag1 -0.43439    0.05832  -7.448 7.96e-13 ***
## yd.diff.lag2 -0.35763    0.06169  -5.797 1.55e-08 ***
## yd.diff.lag3 -0.16949    0.06018  -2.817  0.00514 ** 
## yd.diff.lag4 -0.10401    0.05409  -1.923  0.05533 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 94.55 on 338 degrees of freedom
## Multiple R-squared:  0.2332, Adjusted R-squared:  0.2219 
## F-statistic: 20.56 on 5 and 338 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -2.4902 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62

Fit appropriate SARIMA model

tsplot(diff(allHC$hate_crime, lag=12))

acf(diff(allHC$hate_crime, lag=12), lag=50) # cuts off at lag 12 => SMA(1)

pacf(diff(allHC$hate_crime, lag=12), lag=50) # lags 1, 3 significant; 12, 13, 24 significant. AR(1)? SAR(2)?

#1. try SMA(1), the most obvious candidate: ARIMA(0,0,0)(0,1,1)[12] with drift
mod6HC=Arima(allHC$hate_crime,order=c(0,0,0), seasonal=list(order=c(0,1,1),period=12), include.drift = TRUE)
summary(mod6HC) #AIC=4185.99   AICc=4186.06   BIC=4197.55
## Series: allHC$hate_crime 
## ARIMA(0,0,0)(0,1,1)[12] with drift 
## 
## Coefficients:
##          sma1   drift
##       -0.5361  0.2571
## s.e.   0.0612  0.2225
## 
## sigma^2 = 9582:  log likelihood = -2090
## AIC=4185.99   AICc=4186.06   BIC=4197.55
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE     MASE      ACF1
## Training set 3.478633 95.96676 55.20574 -1.895506 13.35359 1.076936 0.5564345
# z-tests on the SARIMA coefficients
coeftest(mod6HC) #drift not sig.
## 
## z test of coefficients:
## 
##        Estimate Std. Error z value Pr(>|z|)    
## sma1  -0.536100   0.061206 -8.7589   <2e-16 ***
## drift  0.257133   0.222515  1.1556   0.2479    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Refit without the insignificant drift term
mod6.2HC=Arima(allHC$hate_crime,order=c(0,0,0), seasonal=list(order=c(0,1,1),period=12), include.drift = FALSE)
summary(mod6.2HC) #AIC=4185.42   AICc=4185.46   BIC=4193.12 — better AIC and BIC, so drop the drift term
## Series: allHC$hate_crime 
## ARIMA(0,0,0)(0,1,1)[12] 
## 
## Coefficients:
##          sma1
##       -0.5506
## s.e.   0.0621
## 
## sigma^2 = 9587:  log likelihood = -2090.71
## AIC=4185.42   AICc=4185.46   BIC=4193.12
## 
## Training set error measures:
##                    ME     RMSE      MAE        MPE     MAPE     MASE      ACF1
## Training set 9.528176 96.12702 54.42133 -0.2716837 13.01058 1.061634 0.5570456
# z-test: the single SMA(1) coefficient is highly significant
coeftest(mod6.2HC)
## 
## z test of coefficients:
## 
##       Estimate Std. Error z value  Pr(>|z|)    
## sma1 -0.550624   0.062084  -8.869 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#check stationarity of the residuals
tsplot(mod6.2HC$residuals)

# ADF: H0 = unit root
adf.test(mod6.2HC$residuals)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  mod6.2HC$residuals
## Dickey-Fuller = -3.5799, Lag order = 7, p-value = 0.03516
## alternative hypothesis: stationary
# Phillips-Perron: H0 = unit root
pp.test(mod6.2HC$residuals)
## Warning in pp.test(mod6.2HC$residuals): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  mod6.2HC$residuals
## Dickey-Fuller Z(alpha) = -171.37, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
# KPSS: p = 0.038 rejects level stationarity at the 5% level — a warning sign
kpss.test(mod6.2HC$residuals) #0.03812
## 
##  KPSS Test for Level Stationarity
## 
## data:  mod6.2HC$residuals
## KPSS Level = 0.51574, Truncation lag parameter = 5, p-value = 0.03812
# ERS DF-GLS unit-root test on the residuals
summary(ur.ers(mod6.2HC$residuals))
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -379.06  -26.76   -1.19   24.67  847.80 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -0.29824    0.05939  -5.021 8.19e-07 ***
## yd.diff.lag1 -0.23778    0.06849  -3.472 0.000582 ***
## yd.diff.lag2 -0.20396    0.06734  -3.029 0.002638 ** 
## yd.diff.lag3 -0.05792    0.06209  -0.933 0.351521    
## yd.diff.lag4 -0.01885    0.05448  -0.346 0.729639    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 78.83 on 350 degrees of freedom
## Multiple R-squared:  0.2542, Adjusted R-squared:  0.2435 
## F-statistic: 23.86 on 5 and 350 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -5.0215 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
acf(mod6.2HC$residuals, lag=50) # gradual decay => AR behaviour

pacf(mod6.2HC$residuals, lag=50) # cuts off around lag 2 => AR(2); lag 24 also significant

#2. add an AR(2) component: ARIMA(2,0,0)(0,1,1)[12]
mod7HC=Arima(allHC$hate_crime,order=c(2,0,0), seasonal=list(order=c(0,1,1),period=12), include.drift = FALSE)
summary(mod7HC) #AIC=4034.43   AICc=4034.55   BIC=4049.84 — improved
## Series: allHC$hate_crime 
## ARIMA(2,0,0)(0,1,1)[12] 
## 
## Coefficients:
##          ar1     ar2     sma1
##       0.5433  0.1385  -0.8318
## s.e.  0.0540  0.0535   0.0503
## 
## sigma^2 = 5996:  log likelihood = -2013.22
## AIC=4034.43   AICc=4034.55   BIC=4049.84
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE    MAPE      MASE        ACF1
## Training set 1.766899 75.80656 38.07757 -1.348706 9.17772 0.7428053 -0.02594358
# z-tests: all three coefficients significant
coeftest(mod7HC)
## 
## z test of coefficients:
## 
##       Estimate Std. Error z value Pr(>|z|)    
## ar1   0.543260   0.053978  10.064  < 2e-16 ***
## ar2   0.138463   0.053544   2.586  0.00971 ** 
## sma1 -0.831829   0.050307 -16.535  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tsplot(mod7HC$residuals) # residuals look better

# ADF: H0 = unit root
adf.test(mod7HC$residuals)
## Warning in adf.test(mod7HC$residuals): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  mod7HC$residuals
## Dickey-Fuller = -4.6994, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
# Phillips-Perron: H0 = unit root
pp.test(mod7HC$residuals)
## Warning in pp.test(mod7HC$residuals): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  mod7HC$residuals
## Dickey-Fuller Z(alpha) = -361.01, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
# KPSS: p = 0.01 rejects level stationarity — residuals still problematic
kpss.test(mod7HC$residuals) #0.01
## Warning in kpss.test(mod7HC$residuals): p-value smaller than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  mod7HC$residuals
## KPSS Level = 0.80086, Truncation lag parameter = 5, p-value = 0.01
# ERS DF-GLS unit-root test on the residuals
summary(ur.ers(mod7HC$residuals))
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -243.90  -28.73   -3.28   23.44  859.32 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -0.93912    0.12439  -7.550 3.83e-13 ***
## yd.diff.lag1 -0.08793    0.11243  -0.782   0.4347    
## yd.diff.lag2 -0.20683    0.09729  -2.126   0.0342 *  
## yd.diff.lag3 -0.12013    0.07640  -1.572   0.1168    
## yd.diff.lag4 -0.05885    0.05345  -1.101   0.2717    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 75.77 on 350 degrees of freedom
## Multiple R-squared:  0.5263, Adjusted R-squared:  0.5196 
## F-statistic: 77.78 on 5 and 350 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -7.5495 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
acf(mod7HC$residuals, lag=50) # several lags with small significance

pacf(mod7HC$residuals, lag=50) # lag 2 is significant

#3. baseline comparison: let auto.arima pick a model
mod8HC=auto.arima(allHC$hate_crime)
summary(mod8HC) #suggests ARIMA(1,1,1), AIC=4192.2   AICc=4192.27   BIC=4203.85 — not better than mod7HC
## Series: allHC$hate_crime 
## ARIMA(1,1,1) 
## 
## Coefficients:
##          ar1      ma1
##       0.5375  -0.9395
## s.e.  0.0545   0.0221
## 
## sigma^2 = 6805:  log likelihood = -2093.1
## AIC=4192.2   AICc=4192.27   BIC=4203.85
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE       ACF1
## Training set 4.016691 82.14928 46.08507 -1.519288 11.67812 0.8990131 0.01779191
tsplot(mod8HC$residuals) # residuals look better

# ADF: H0 = unit root
adf.test(mod8HC$residuals)
## Warning in adf.test(mod8HC$residuals): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  mod8HC$residuals
## Dickey-Fuller = -7.2941, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
# Phillips-Perron: H0 = unit root
pp.test(mod8HC$residuals)
## Warning in pp.test(mod8HC$residuals): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  mod8HC$residuals
## Dickey-Fuller Z(alpha) = -340.79, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
# KPSS: H0 = level stationarity; p = 0.1 does not reject
kpss.test(mod8HC$residuals)
## Warning in kpss.test(mod8HC$residuals): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  mod8HC$residuals
## KPSS Level = 0.27729, Truncation lag parameter = 5, p-value = 0.1
# ERS DF-GLS test; residuals pass all four stationarity checks
summary(ur.ers(mod8HC$residuals)) #passed all
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -311.13  -31.12    4.98   31.42  880.66 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -1.08007    0.11872  -9.098   <2e-16 ***
## yd.diff.lag1  0.10104    0.10600   0.953    0.341    
## yd.diff.lag2  0.08494    0.09186   0.925    0.356    
## yd.diff.lag3  0.08215    0.07494   1.096    0.274    
## yd.diff.lag4  0.07490    0.05354   1.399    0.163    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 83.05 on 350 degrees of freedom
## Multiple R-squared:  0.4917, Adjusted R-squared:  0.4844 
## F-statistic: 67.71 on 5 and 350 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -9.0977 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
acf(mod8HC$residuals, lag=100) # seasonal correlation remains => needs seasonal differencing

pacf(mod8HC$residuals, lag=50) # lag 12 significant then cuts off => SAR(1)

#4. extend mod7HC to fix the remaining acf/pacf structure
#add an MA(1) term: ARIMA(2,0,1)(0,1,1)[12]
mod9HC=Arima(allHC$hate_crime,order=c(2,0,1), seasonal=list(order=c(0,1,1),period=12), include.drift = FALSE)
summary(mod9HC) #AIC=4017.1   AICc=4017.27   BIC=4036.36 — improved
## Series: allHC$hate_crime 
## ARIMA(2,0,1)(0,1,1)[12] 
## 
## Coefficients:
##          ar1      ar2      ma1     sma1
##       1.3100  -0.3239  -0.8332  -0.8858
## s.e.  0.0924   0.0832   0.0658   0.0443
## 
## sigma^2 = 5642:  log likelihood = -2003.55
## AIC=4017.1   AICc=4017.27   BIC=4036.36
## 
## Training set error measures:
##                    ME     RMSE      MAE        MPE     MAPE      MASE
## Training set 3.504163 73.42226 35.84429 -0.5534322 8.520611 0.6992392
##                     ACF1
## Training set 0.008615474
# z-tests: all four coefficients significant
coeftest(mod9HC) #all sig.
## 
## z test of coefficients:
## 
##       Estimate Std. Error  z value  Pr(>|z|)    
## ar1   1.309959   0.092409  14.1757 < 2.2e-16 ***
## ar2  -0.323904   0.083188  -3.8936 9.875e-05 ***
## ma1  -0.833242   0.065763 -12.6704 < 2.2e-16 ***
## sma1 -0.885843   0.044287 -20.0022 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tsplot(mod9HC$residuals) # residuals look okay

# ADF: H0 = unit root
adf.test(mod9HC$residuals)
## Warning in adf.test(mod9HC$residuals): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  mod9HC$residuals
## Dickey-Fuller = -6.0964, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
# Phillips-Perron: H0 = unit root
pp.test(mod9HC$residuals)
## Warning in pp.test(mod9HC$residuals): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  mod9HC$residuals
## Dickey-Fuller Z(alpha) = -348.9, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
# KPSS: H0 = level stationarity; p = 0.1 does not reject
kpss.test(mod9HC$residuals)
## Warning in kpss.test(mod9HC$residuals): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  mod9HC$residuals
## KPSS Level = 0.19751, Truncation lag parameter = 5, p-value = 0.1
# ERS DF-GLS test; residuals pass all four stationarity checks
summary(ur.ers(mod9HC$residuals)) #passed all
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -226.56  -23.99   -0.16   20.68  854.36 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -1.00651    0.11986  -8.397 1.15e-15 ***
## yd.diff.lag1  0.02163    0.10765   0.201    0.841    
## yd.diff.lag2 -0.03271    0.09368  -0.349    0.727    
## yd.diff.lag3  0.01588    0.07529   0.211    0.833    
## yd.diff.lag4  0.02188    0.05364   0.408    0.684    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 74.27 on 350 degrees of freedom
## Multiple R-squared:  0.4963, Adjusted R-squared:  0.4891 
## F-statistic: 68.97 on 5 and 350 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -8.3974 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
acf(mod9HC$residuals, lag=50) # looks good — no significant structure

pacf(mod9HC$residuals, lag=50) # looks good

#5. acf/pacf issues fixed; now add terms to confirm mod9HC is the best model
#add a SAR(1) term: ARIMA(2,0,1)(1,1,1)[12]
mod10HC=Arima(allHC$hate_crime,order=c(2,0,1), seasonal=list(order=c(1,1,1),period=12), include.drift = FALSE)
summary(mod10HC) #AIC=4018.41   AICc=4018.65   BIC=4041.52 — got worse
## Series: allHC$hate_crime 
## ARIMA(2,0,1)(1,1,1)[12] 
## 
## Coefficients:
##          ar1      ar2      ma1    sar1     sma1
##       1.3095  -0.3248  -0.8314  0.0598  -0.9035
## s.e.  0.0970   0.0861   0.0706  0.0722   0.0476
## 
## sigma^2 = 5636:  log likelihood = -2003.2
## AIC=4018.41   AICc=4018.65   BIC=4041.52
## 
## Training set error measures:
##                    ME    RMSE     MAE        MPE    MAPE      MASE        ACF1
## Training set 3.525461 73.2823 35.7457 -0.5631361 8.50513 0.6973159 0.007914683
# z-tests: the added sar1 term is not significant
coeftest(mod10HC) #sar1 not sig
## 
## z test of coefficients:
## 
##       Estimate Std. Error  z value  Pr(>|z|)    
## ar1   1.309549   0.097003  13.5001 < 2.2e-16 ***
## ar2  -0.324837   0.086072  -3.7740 0.0001606 ***
## ma1  -0.831413   0.070611 -11.7746 < 2.2e-16 ***
## sar1  0.059780   0.072215   0.8278 0.4077815    
## sma1 -0.903524   0.047553 -19.0004 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tsplot(mod10HC$residuals)

# ADF: H0 = unit root
adf.test(mod10HC$residuals)
## Warning in adf.test(mod10HC$residuals): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  mod10HC$residuals
## Dickey-Fuller = -6.0792, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
# Phillips-Perron: H0 = unit root
pp.test(mod10HC$residuals)
## Warning in pp.test(mod10HC$residuals): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  mod10HC$residuals
## Dickey-Fuller Z(alpha) = -349.75, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
# KPSS: H0 = level stationarity; p = 0.1 does not reject
kpss.test(mod10HC$residuals)
## Warning in kpss.test(mod10HC$residuals): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  mod10HC$residuals
## KPSS Level = 0.19902, Truncation lag parameter = 5, p-value = 0.1
# ERS DF-GLS test; residuals pass all four stationarity checks
summary(ur.ers(mod10HC$residuals)) #passed all
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -224.21  -23.35    0.86   21.22  850.25 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -1.00190    0.11980  -8.363 1.47e-15 ***
## yd.diff.lag1  0.01610    0.10763   0.150    0.881    
## yd.diff.lag2 -0.03669    0.09366  -0.392    0.695    
## yd.diff.lag3  0.01159    0.07531   0.154    0.878    
## yd.diff.lag4  0.01876    0.05363   0.350    0.727    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 74.14 on 350 degrees of freedom
## Multiple R-squared:  0.4965, Adjusted R-squared:  0.4893 
## F-statistic: 69.03 on 5 and 350 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -8.3629 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
acf(mod10HC$residuals, lag=50) #looks good

pacf(mod10HC$residuals, lag=50) #looks good

#add regular diff.: candidate model ARIMA(2,1,1)(1,1,1)[12] on the all-race series
mod11HC=Arima(allHC$hate_crime,order=c(2,1,1), seasonal=list(order=c(1,1,1),period=12), include.drift = FALSE)
summary(mod11HC) #AIC=4010.96   AICc=4011.21   BIC=4034.06, got better
## Series: allHC$hate_crime 
## ARIMA(2,1,1)(1,1,1)[12] 
## 
## Coefficients:
##          ar1      ar2      ma1    sar1     sma1
##       0.3276  -0.0513  -0.8304  0.0462  -0.8973
## s.e.  0.0832   0.0707   0.0658  0.0715   0.0477
## 
## sigma^2 = 5680:  log likelihood = -1999.48
## AIC=4010.96   AICc=4011.21   BIC=4034.06
## 
## Training set error measures:
##                     ME   RMSE      MAE       MPE     MAPE      MASE
## Training set -3.764594 73.457 35.93096 -2.124327 8.600325 0.7009299
##                       ACF1
## Training set -9.150525e-05
# z tests on the coefficients: ar2 and sar1 are not significant at the 5% level
coeftest(mod11HC) #two terms not sig. ar2, sar1
## 
## z test of coefficients:
## 
##       Estimate Std. Error  z value  Pr(>|z|)    
## ar1   0.327572   0.083181   3.9380 8.215e-05 ***
## ar2  -0.051287   0.070706  -0.7254    0.4682    
## ma1  -0.830389   0.065779 -12.6239 < 2.2e-16 ***
## sar1  0.046235   0.071496   0.6467    0.5178    
## sma1 -0.897290   0.047750 -18.7914 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tsplot(mod11HC$residuals) 

# stationarity battery on the residuals: ADF, PP, KPSS, ERS
adf.test(mod11HC$residuals)
## Warning in adf.test(mod11HC$residuals): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  mod11HC$residuals
## Dickey-Fuller = -6.6266, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(mod11HC$residuals)
## Warning in pp.test(mod11HC$residuals): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  mod11HC$residuals
## Dickey-Fuller Z(alpha) = -356.48, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
# KPSS p = 0.023 < 0.05: reject the stationarity null -> this model fails KPSS
kpss.test(mod11HC$residuals) #0.02322
## 
##  KPSS Test for Level Stationarity
## 
## data:  mod11HC$residuals
## KPSS Level = 0.59353, Truncation lag parameter = 5, p-value = 0.02322
summary(ur.ers(mod11HC$residuals)) 
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -238.14  -28.44   -2.77   17.99  843.65 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -0.98769    0.11889  -8.308 2.17e-15 ***
## yd.diff.lag1 -0.01088    0.10704  -0.102    0.919    
## yd.diff.lag2 -0.02948    0.09365  -0.315    0.753    
## yd.diff.lag3  0.02252    0.07598   0.296    0.767    
## yd.diff.lag4  0.02249    0.05373   0.419    0.676    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 74.29 on 350 degrees of freedom
## Multiple R-squared:  0.5005, Adjusted R-squared:  0.4934 
## F-statistic: 70.15 on 5 and 350 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -8.3076 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
acf(mod11HC$residuals, lag=50) #looks good

pacf(mod11HC$residuals, lag=50) #looks good; AIC and BIC improved, but compared to mod9HC this model didn't pass one stationarity test (KPSS) and two terms were not significant. Thus, we will use mod9HC as the best model.

#comparing 7 different SARIMA models, our best model is SARIMA(2,0,1)(0,1,1)[12]
mod9HC=Arima(allHC$hate_crime,order=c(2,0,1), seasonal=list(order=c(0,1,1),period=12), include.drift = FALSE)
summary(mod9HC) #AIC=4017.1   AICc=4017.27   BIC=4036.36
## Series: allHC$hate_crime 
## ARIMA(2,0,1)(0,1,1)[12] 
## 
## Coefficients:
##          ar1      ar2      ma1     sma1
##       1.3100  -0.3239  -0.8332  -0.8858
## s.e.  0.0924   0.0832   0.0658   0.0443
## 
## sigma^2 = 5642:  log likelihood = -2003.55
## AIC=4017.1   AICc=4017.27   BIC=4036.36
## 
## Training set error measures:
##                    ME     RMSE      MAE        MPE     MAPE      MASE
## Training set 3.504163 73.42226 35.84429 -0.5534322 8.520611 0.6992392
##                     ACF1
## Training set 0.008615474
# all four coefficients are highly significant
coeftest(mod9HC)
## 
## z test of coefficients:
## 
##       Estimate Std. Error  z value  Pr(>|z|)    
## ar1   1.309959   0.092409  14.1757 < 2.2e-16 ***
## ar2  -0.323904   0.083188  -3.8936 9.875e-05 ***
## ma1  -0.833242   0.065763 -12.6704 < 2.2e-16 ***
## sma1 -0.885843   0.044287 -20.0022 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# squared correlation between fitted and observed values (in-sample pseudo-R^2)
cor(fitted(mod9HC), allHC$hate_crime)^2 #0.5787597
## [1] 0.5787597
#check normality of the residuals (needed for trustworthy prediction intervals)
qqPlot(mod9HC$residuals) #not normal, outliers

## [1] 129 354
densityplot(as.numeric(mod9HC$residuals)) #some outliers

shapiro.test(mod9HC$residuals) #not normal, we can't trust forecast interval
## 
##  Shapiro-Wilk normality test
## 
## data:  mod9HC$residuals
## W = 0.56305, p-value < 2.2e-16

Best model is SARIMA(2,0,1)×(0,1,1)[12] (seasonal period 12).

Compare the function-of-time model and the SARIMA model.

#best function of time model: seasonal (month-dummy) model with a linear trend
t=1:360
mod5HC=lm(hate_crime~t+as.factor(month)+0, data=allHC)
#summary(mod5HC) #Adjusted R-squared:  0.2464 #better than all above
#AIC(mod5HC) #4336.28
#BIC(mod5HC) #4390.685

#best SARIMA model
mod9HC=Arima(allHC$hate_crime,order=c(2,0,1), seasonal=list(order=c(0,1,1),period=12), include.drift = FALSE)
#summary(mod9HC) #AIC=4017.1   AICc=4017.27   BIC=4036.36

#compare forecasting performance of two models
#str(allHC)

# training set: rows 1-348 = Jan 1991 through Dec 2019; the 12 months of 2020 are held out
a=allHC[1:348,]
t=1:348
foftimeMod = lm(hate_crime~t+as.factor(month)+0, data=a)
AIC(foftimeMod) #4066.171
## [1] 4066.171
BIC(foftimeMod) #4120.102
## [1] 4120.102
# forecast the 12 held-out months (t = 349..360, months Jan-Dec)
foftimePred= predict(foftimeMod, data.frame(t=349:360, month=1:12))

# refit the best SARIMA spec on the training window only, then forecast 12 steps
sarimaMod=Arima(a$hate_crime,order=c(2,0,1), seasonal=list(order=c(0,1,1),period=12), include.drift = FALSE) 
#summary(sarimaMod) #AIC=3780.98   AICc=3781.16   BIC=3800.06
sarimaPred=forecast(sarimaMod,h=12)$mean

#real (held-out) data for 2020
real_value=allHC$hate_crime[c(349:360)]

#compare predictions with real data via squared correlation
cor(real_value,foftimePred)^2 # R^2 = 0.2608992
## [1] 0.2608992
cor(real_value,sarimaPred)^2 # R^2 = 0.3221583, this has best prediction
## [1] 0.3221583

The SARIMA model is better: it achieves the higher out-of-sample R² against the held-out data.

Prediction using the best SARIMA model

#plot prediction from the best model, refit on the full series
pred_allHC=forecast(mod9HC,h=24) # two years ahead: 2021, 2022
plot(pred_allHC, main="Prediction on all racial hate crimes", xlab="Time", ylab="Hate crimes")

# cross-check: same SARIMA(2,0,1)(0,1,1)[12] forecast via astsa::sarima.for
forecasting2<-sarima.for(allHC$hate_crime, 2,0,1, 0,1,1, 12, n.ahead = 24)

forecasting2$pred
## Time Series:
## Start = 361 
## End = 384 
## Frequency = 1 
##  [1] 420.6226 441.0736 499.7341 503.8945 551.0491 599.8965 588.0991 569.8787
##  [9] 560.5525 550.7625 489.0558 430.3178 444.3148 449.4107 502.9406 505.4231
## [17] 552.0655 600.7935 589.0087 570.8446 561.5887 551.8730 490.2411 431.5774
# point forecasts with 80% and 95% prediction intervals from forecast::forecast
pred_allHC
##     Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 361       416.4140 320.1489 512.6791 269.1892 563.6388
## 362       434.0076 327.3642 540.6510 270.9106 597.1046
## 363       490.6057 380.1072 601.1042 321.6128 659.5986
## 364       493.0561 380.1816 605.9305 320.4295 665.6826
## 365       538.5774 423.8014 653.3534 363.0427 714.1122
## 366       585.6799 469.2097 702.1501 407.5541 763.8057
## 367       572.5839 454.5436 690.6243 392.0569 753.1110
## 368       553.0345 433.5182 672.5509 370.2501 735.8190
## 369       542.4946 421.5833 663.4060 357.5767 727.4126
## 370       531.3875 409.1547 653.6203 344.4485 718.3264
## 371       468.3682 344.8821 591.8543 279.5125 657.2239
## 372       408.3875 283.7119 533.0631 217.7126 599.0624
## 373       420.7937 293.0511 548.5363 225.4282 616.1591
## 374       424.3957 294.8201 553.9714 226.2269 622.5645
## 375       476.5960 345.5707 607.6212 276.2101 676.9818
## 376       477.8172 345.5026 610.1318 275.4595 680.1749
## 377       523.1530 389.6408 656.6651 318.9638 727.3422
## 378       570.4104 435.7687 705.0521 364.4937 776.3271
## 379       557.5776 421.8645 693.2906 350.0224 765.1328
## 380       538.3227 401.5912 675.0541 329.2100 747.4353
## 381       528.0833 390.3829 665.7836 317.4887 738.6778
## 382       517.2744 378.6515 655.8973 305.2690 729.2798
## 383       454.5486 315.0470 594.0501 241.1994 667.8978
## 384       394.8556 254.5170 535.1942 180.2263 609.4850

2. Black hate crime

Get time series plots for anti-African American or Black hate crimes

summary(blackHC)
##       year          month         hate_crime      difference       
##  Min.   :1991   Min.   : 1.00   Min.   : 88.0   Min.   :-206.0000  
##  1st Qu.:1998   1st Qu.: 3.75   1st Qu.:167.0   1st Qu.: -22.0000  
##  Median :2006   Median : 6.50   Median :202.0   Median :   2.0000  
##  Mean   :2006   Mean   : 6.50   Mean   :207.7   Mean   :   0.2444  
##  3rd Qu.:2013   3rd Qu.: 9.25   3rd Qu.:242.5   3rd Qu.:  21.0000  
##  Max.   :2020   Max.   :12.00   Max.   :693.0   Max.   : 456.0000
# build monthly xts series; ts3 (POSIXct index) is created but not used below,
# ts4 (yearmon index) is the one plotted
ts3 <- xts(blackHC$hate_crime, as.POSIXct(sprintf("%d-%d-01", blackHC$year, blackHC$month)))
ts4 <- xts(blackHC$hate_crime, as.yearmon(blackHC$year + (blackHC$month-1)/12))

plot(ts4, main="Anti-African American or Black Hate Crimes")

# (exploratory year-by-year plotting, left disabled)
#par(mfrow=1:2)
#for (i in 1:30){
#  start=12*(i-1)+1
#  end=12*i
#  print(plot(ts4[start:end]))
#}

Fit function-of-time models

#start with a linear trend model
t=1:360
mod1B=lm(hate_crime~t, data=blackHC)
summary(mod1B) #Adjusted R-squared: 0.07153  
## 
## Call:
## lm(formula = hate_crime ~ t, data = blackHC)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -148.47  -35.69   -3.80   27.88  513.16 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 236.63067    6.24207  37.909  < 2e-16 ***
## t            -0.16043    0.02997  -5.353 1.55e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 59.09 on 358 degrees of freedom
## Multiple R-squared:  0.07411,    Adjusted R-squared:  0.07153 
## F-statistic: 28.66 on 1 and 358 DF,  p-value: 1.547e-07
#checking stationarity of the residuals via a lowess smooth
plot(mod1B$residuals) #doesn't look stationary, fit quadratic trend
lines(lowess(mod1B$residuals))

#quadratic trend model
mod2B=lm(hate_crime~t+I(t^2), data=blackHC)
summary(mod2B) #Adjusted R-squared:  0.08305 
## 
## Call:
## lm(formula = hate_crime ~ t + I(t^2), data = blackHC)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -132.38  -35.48   -4.16   27.64  527.67 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.203e+02  9.337e+00  23.590   <2e-16 ***
## t            1.108e-01  1.194e-01   0.928   0.3541    
## I(t^2)      -7.514e-04  3.204e-04  -2.345   0.0196 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 58.73 on 357 degrees of freedom
## Multiple R-squared:  0.08816,    Adjusted R-squared:  0.08305 
## F-statistic: 17.26 on 2 and 357 DF,  p-value: 7.005e-08
#tsplot(mod2B$residuals) #didn't change much from the original tsplot
plot(mod2B$residuals) #doesn't look stationary, seasonality remains
lines(lowess(mod2B$residuals))

#cosine model: add harmonic (sin/cos) regressors with period 12 for the annual cycle
blackHC$Xcos=cos(2*pi*t/12)
blackHC$Xsin=sin(2*pi*t/12)

mod3B=lm(hate_crime~t+I(t^2)+Xcos+Xsin,data=blackHC)
summary(mod3B) #Adjusted R-squared:  0.228 , improved. t and t^2 not sig.
## 
## Call:
## lm(formula = hate_crime ~ t + I(t^2) + Xcos + Xsin, data = blackHC)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -135.72  -31.35   -4.31   24.78  498.30 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.206e+02  8.569e+00  25.740  < 2e-16 ***
## t            1.086e-01  1.096e-01   0.991 0.322434    
## I(t^2)      -7.489e-04  2.940e-04  -2.547 0.011283 *  
## Xcos        -2.954e+01  4.016e+00  -7.355 1.33e-12 ***
## Xsin        -1.551e+01  4.018e+00  -3.861 0.000134 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 53.89 on 355 degrees of freedom
## Multiple R-squared:  0.2366, Adjusted R-squared:  0.228 
## F-statistic:  27.5 on 4 and 355 DF,  p-value: < 2.2e-16
tsplot(mod3B$residuals) #looks like it got rid of seasonal trend, but still not stationary

plot(mod3B$residuals)
lines(lowess(mod3B$residuals))#better than two models from above

#cosine model, dropping the (insignificant) trend terms
mod4B=lm(hate_crime~Xcos+Xsin,data=blackHC)
summary(mod4B) #Adjusted R-squared:  0.1426 not better than above
## 
## Call:
## lm(formula = hate_crime ~ Xcos + Xsin, data = blackHC)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -122.39  -37.79   -4.44   28.84  455.61 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  207.672      2.993  69.386  < 2e-16 ***
## Xcos         -29.715      4.233  -7.020 1.12e-11 ***
## Xsin         -14.912      4.233  -3.523 0.000482 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 56.79 on 357 degrees of freedom
## Multiple R-squared:  0.1474, Adjusted R-squared:  0.1426 
## F-statistic: 30.85 on 2 and 357 DF,  p-value: 4.387e-13
tsplot(mod4B$residuals) #looks like it got rid of seasonal trend, but still not stationary

plot(mod4B$residuals)
lines(lowess(mod4B$residuals))#better than first two models from above

#seasonal model: month dummies plus a linear trend, no intercept (one dummy per month)
mod5B=lm(hate_crime~t+as.factor(month)+0, data=blackHC)
# NOTE(review): with +0 the R-squared is computed without centering, which inflates it;
# 0.9388 is not comparable to the adjusted R-squared of the intercept models above
summary(mod5B) #Adjusted R-squared:  0.9388 #better than all above
## 
## Call:
## lm(formula = hate_crime ~ t + as.factor(month) + 0, data = blackHC)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -143.46  -30.01   -1.42   23.95  490.53 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## t                   -0.16089    0.02718  -5.919 7.76e-09 ***
## as.factor(month)1  203.22164   10.87447  18.688  < 2e-16 ***
## as.factor(month)2  205.68252   10.88638  18.894  < 2e-16 ***
## as.factor(month)3  239.81008   10.89835  22.004  < 2e-16 ***
## as.factor(month)4  241.03763   10.91038  22.093  < 2e-16 ***
## as.factor(month)5  252.39851   10.92246  23.108  < 2e-16 ***
## as.factor(month)6  259.42607   10.93459  23.725  < 2e-16 ***
## as.factor(month)7  261.62028   10.94678  23.899  < 2e-16 ***
## as.factor(month)8  263.08117   10.95902  24.006  < 2e-16 ***
## as.factor(month)9  254.14206   10.97131  23.164  < 2e-16 ***
## as.factor(month)10 261.03627   10.98366  23.766  < 2e-16 ***
## as.factor(month)11 217.43049   10.99606  19.773  < 2e-16 ***
## as.factor(month)12 181.65805   11.00852  16.502  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 53.56 on 347 degrees of freedom
## Multiple R-squared:  0.941,  Adjusted R-squared:  0.9388 
## F-statistic: 425.8 on 13 and 347 DF,  p-value: < 2.2e-16
tsplot(mod5B$residuals) #looks like it got rid of seasonal trend, but still not stationary

plot(mod5B$residuals)
lines(lowess(mod5B$residuals))#better than first two models from above

AIC(mod5B)
## [1] 3902.605
BIC(mod5B)
## [1] 3957.01
#none of the residual plots looks like white noise. try SARIMA

None of the models fixed the non-stationarity observed in the residuals. We cannot use the function-of-time models for prediction.

Check stationarity of the data and transform data for SARIMA model fitting

tsplot(blackHC$hate_crime) #doesn't look stationary

# stationarity battery on the raw series: ADF, PP, KPSS, ERS
adf.test(blackHC$hate_crime) 
## 
##  Augmented Dickey-Fuller Test
## 
## data:  blackHC$hate_crime
## Dickey-Fuller = -3.8531, Lag order = 7, p-value = 0.01662
## alternative hypothesis: stationary
pp.test(blackHC$hate_crime)
## Warning in pp.test(blackHC$hate_crime): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  blackHC$hate_crime
## Dickey-Fuller Z(alpha) = -94.405, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
# KPSS rejects stationarity (p = 0.01) -> raw series fails
kpss.test(blackHC$hate_crime) #fail
## Warning in kpss.test(blackHC$hate_crime): p-value smaller than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  blackHC$hate_crime
## KPSS Level = 1.2825, Truncation lag parameter = 5, p-value = 0.01
summary(ur.ers(blackHC$hate_crime))
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -107.84  -17.82    4.91   25.28  472.06 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## yd.lag       -0.03800    0.01933  -1.965  0.05017 . 
## yd.diff.lag1 -0.15915    0.05508  -2.889  0.00410 **
## yd.diff.lag2 -0.17240    0.05527  -3.119  0.00196 **
## yd.diff.lag3 -0.13262    0.05471  -2.424  0.01585 * 
## yd.diff.lag4 -0.07799    0.05443  -1.433  0.15277   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 41.85 on 350 degrees of freedom
## Multiple R-squared:  0.07575,    Adjusted R-squared:  0.06255 
## F-statistic: 5.737 on 5 and 350 DF,  p-value: 4.187e-05
## 
## 
## Value of test-statistic is: -1.9653 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
acf(blackHC$hate_crime, lag=100) #doesn't look stationary, might need seasonal differencing

# seasonal (lag-12) difference of the series
tsplot(diff(blackHC$hate_crime, lag=12)) #looks better but some concern with variance

adf.test(diff(blackHC$hate_crime, lag=12)) 
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff(blackHC$hate_crime, lag = 12)
## Dickey-Fuller = -3.9586, Lag order = 7, p-value = 0.01137
## alternative hypothesis: stationary
pp.test(diff(blackHC$hate_crime, lag=12))
## Warning in pp.test(diff(blackHC$hate_crime, lag = 12)): p-value smaller than
## printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  diff(blackHC$hate_crime, lag = 12)
## Dickey-Fuller Z(alpha) = -129.26, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(diff(blackHC$hate_crime, lag=12)) 
## Warning in kpss.test(diff(blackHC$hate_crime, lag = 12)): p-value greater than
## printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  diff(blackHC$hate_crime, lag = 12)
## KPSS Level = 0.28315, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(diff(blackHC$hate_crime, lag=12))) #pass all
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -126.53  -26.14   -4.49   14.71  479.16 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -0.045894   0.022544  -2.036  0.04255 *  
## yd.diff.lag1 -0.305953   0.056922  -5.375 1.43e-07 ***
## yd.diff.lag2 -0.292114   0.058250  -5.015 8.59e-07 ***
## yd.diff.lag3 -0.182152   0.057354  -3.176  0.00163 ** 
## yd.diff.lag4 -0.006499   0.055326  -0.117  0.90656    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40.99 on 338 degrees of freedom
## Multiple R-squared:  0.1564, Adjusted R-squared:  0.1439 
## F-statistic: 12.53 on 5 and 338 DF,  p-value: 3.562e-11
## 
## 
## Value of test-statistic is: -2.0358 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
acf(diff(blackHC$hate_crime, lag=12), lag=100) #doesn't look stationary, might need regular differencing?

# regular (lag-1) difference on top of the seasonal difference
tsplot(diff(diff(blackHC$hate_crime, lag=12)))

adf.test(diff(diff(blackHC$hate_crime, lag=12)))
## Warning in adf.test(diff(diff(blackHC$hate_crime, lag = 12))): p-value smaller
## than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff(diff(blackHC$hate_crime, lag = 12))
## Dickey-Fuller = -9.2152, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(diff(diff(blackHC$hate_crime, lag=12)))
## Warning in pp.test(diff(diff(blackHC$hate_crime, lag = 12))): p-value smaller
## than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  diff(diff(blackHC$hate_crime, lag = 12))
## Dickey-Fuller Z(alpha) = -342.11, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(diff(diff(blackHC$hate_crime, lag=12)))
## Warning in kpss.test(diff(diff(blackHC$hate_crime, lag = 12))): p-value greater
## than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  diff(diff(blackHC$hate_crime, lag = 12))
## KPSS Level = 0.025647, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(diff(diff(blackHC$hate_crime, lag=12)))) #pass all
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -129.48   -9.84   11.11   32.03  491.58 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -1.11638    0.14955  -7.465 7.17e-13 ***
## yd.diff.lag1 -0.07589    0.13434  -0.565  0.57250    
## yd.diff.lag2 -0.21318    0.11169  -1.909  0.05715 .  
## yd.diff.lag3 -0.22730    0.08527  -2.666  0.00805 ** 
## yd.diff.lag4 -0.07329    0.05512  -1.330  0.18455    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 44.16 on 337 degrees of freedom
## Multiple R-squared:  0.5991, Adjusted R-squared:  0.5931 
## F-statistic: 100.7 on 5 and 337 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -7.4651 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
acf(diff(diff(blackHC$hate_crime, lag=12))) #no huge seasonal pattern. do both differences (d=1, D=1)

# log transform considered and rejected
tsplot(log(blackHC$hate_crime)) #doesn't fix the issue with non-constant variance

Fit appropriate SARIMA model

#black hate crime: identify SARIMA orders from the doubly differenced series
tsplot(diff(diff(blackHC$hate_crime, lag=12)))

acf(diff(diff(blackHC$hate_crime, lag=12)), lag=50) #lag 1,2,12 significant -> suggests MA(2) and SMA(1)

pacf(diff(diff(blackHC$hate_crime, lag=12)), lag=50) #cut off at 12, SAR(1)

#1. start with SMA(1) included
# drift cannot be estimated when total differencing d + D >= 2, hence the warning
mod6B=Arima(blackHC$hate_crime,order=c(0,1,0), seasonal=list(order=c(0,1,1),period=12), include.drift = TRUE) #has warning, can't include drift if we do 2 differencings
## Warning in Arima(blackHC$hate_crime, order = c(0, 1, 0), seasonal = list(order =
## c(0, : No drift term fitted as the order of difference is 2 or more.
mod6.2B=Arima(blackHC$hate_crime,order=c(0,1,0), seasonal=list(order=c(0,1,1),period=12), include.drift =FALSE)
summary(mod6.2B) #AIC=3535.3   AICc=3535.34   BIC=3543
## Series: blackHC$hate_crime 
## ARIMA(0,1,0)(0,1,1)[12] 
## 
## Coefficients:
##          sma1
##       -0.7664
## s.e.   0.0574
## 
## sigma^2 = 1497:  log likelihood = -1765.65
## AIC=3535.3   AICc=3535.34   BIC=3543
## 
## Training set error measures:
##                      ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -0.6671603 37.92833 22.38026 -1.091368 10.36432 0.7878519
##                    ACF1
## Training set -0.2021082
tsplot(mod6.2B$residuals) #fairly okay

# stationarity battery on the residuals
adf.test(mod6.2B$residuals) #stationary
## Warning in adf.test(mod6.2B$residuals): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  mod6.2B$residuals
## Dickey-Fuller = -9.6037, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(mod6.2B$residuals) #stationary
## Warning in pp.test(mod6.2B$residuals): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  mod6.2B$residuals
## Dickey-Fuller Z(alpha) = -341.02, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(mod6.2B$residuals) #stationary
## Warning in kpss.test(mod6.2B$residuals): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  mod6.2B$residuals
## KPSS Level = 0.098712, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(mod6.2B$residuals)) #stationary
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -123.35  -14.85    2.22   15.69  466.26 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -2.14489    0.18504 -11.591  < 2e-16 ***
## yd.diff.lag1  0.82165    0.16083   5.109 5.34e-07 ***
## yd.diff.lag2  0.46843    0.12803   3.659 0.000292 ***
## yd.diff.lag3  0.22233    0.09161   2.427 0.015727 *  
## yd.diff.lag4  0.13157    0.05497   2.394 0.017213 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35.44 on 350 degrees of freedom
## Multiple R-squared:  0.6458, Adjusted R-squared:  0.6408 
## F-statistic: 127.6 on 5 and 350 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -11.5913 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
acf(mod6.2B$residuals, lag=100) #ma(2) most obvious

pacf(mod6.2B$residuals, lag=100) #1,2,3 significant

#2. add MA(2): ARIMA(0,1,2)(0,1,1)[12]
mod7B=Arima(blackHC$hate_crime,order=c(0,1,2), seasonal=list(order=c(0,1,1),period=12), include.drift =FALSE)
summary(mod7B) #AIC=3478.41   AICc=3478.53   BIC=3493.81
## Series: blackHC$hate_crime 
## ARIMA(0,1,2)(0,1,1)[12] 
## 
## Coefficients:
##           ma1      ma2     sma1
##       -0.3883  -0.2895  -0.7777
## s.e.   0.0498   0.0497   0.0571
## 
## sigma^2 = 1259:  log likelihood = -1735.2
## AIC=3478.41   AICc=3478.53   BIC=3493.81
## 
## Training set error measures:
##                     ME    RMSE      MAE       MPE     MAPE      MASE       ACF1
## Training set -1.088744 34.6865 19.38158 -1.575956 9.117224 0.6822895 0.02488984
tsplot(mod7B$residuals) #fairly okay

adf.test(mod7B$residuals) #stationary
## Warning in adf.test(mod7B$residuals): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  mod7B$residuals
## Dickey-Fuller = -6.8834, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(mod7B$residuals) #stationary
## Warning in pp.test(mod7B$residuals): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  mod7B$residuals
## Dickey-Fuller Z(alpha) = -344.37, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
# KPSS p = 0.042 < 0.05: rejects the stationarity null -> this model fails KPSS
kpss.test(mod7B$residuals) #0.042, failed
## 
##  KPSS Test for Level Stationarity
## 
## data:  mod7B$residuals
## KPSS Level = 0.49854, Truncation lag parameter = 5, p-value = 0.042
summary(ur.ers(mod7B$residuals)) #stationary
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -104.98  -15.68    1.07   14.59  467.60 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -1.01741    0.11879  -8.565 3.48e-16 ***
## yd.diff.lag1  0.05184    0.10699   0.485    0.628    
## yd.diff.lag2  0.08170    0.09176   0.890    0.374    
## yd.diff.lag3  0.02724    0.07583   0.359    0.720    
## yd.diff.lag4  0.07757    0.05431   1.428    0.154    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34.94 on 350 degrees of freedom
## Multiple R-squared:  0.4895, Adjusted R-squared:  0.4822 
## F-statistic: 67.12 on 5 and 350 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -8.5651 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
acf(mod7B$residuals, lag=50) #looks okay

pacf(mod7B$residuals, lag=50) #looks okay

#3. add additional terms one at a time to see if our current model is the best. 
#add SAR(1): ARIMA(0,1,2)(1,1,1)[12]
mod8B=Arima(blackHC$hate_crime,order=c(0,1,2), seasonal=list(order=c(1,1,1),period=12), include.drift =FALSE)
summary(mod8B) #AIC=3480.22   AICc=3480.4   BIC=3499.47 #got worse
## Series: blackHC$hate_crime 
## ARIMA(0,1,2)(1,1,1)[12] 
## 
## Coefficients:
##           ma1      ma2     sar1     sma1
##       -0.3887  -0.2889  -0.0429  -0.7609
## s.e.   0.0499   0.0497   0.0990   0.0713
## 
## sigma^2 = 1262:  log likelihood = -1735.11
## AIC=3480.22   AICc=3480.4   BIC=3499.47
## 
## Training set error measures:
##                     ME     RMSE     MAE       MPE    MAPE      MASE       ACF1
## Training set -1.050696 34.67547 19.3509 -1.557614 9.10181 0.6812094 0.02457845
#add AR(1): ARIMA(1,1,2)(0,1,1)[12]
mod9B=Arima(blackHC$hate_crime,order=c(1,1,2), seasonal=list(order=c(0,1,1),period=12), include.drift =FALSE)
summary(mod9B) #AIC=3478.09   AICc=3478.27   BIC=3497.34 #AIC slightly improved, BIC got much worse
## Series: blackHC$hate_crime 
## ARIMA(1,1,2)(0,1,1)[12] 
## 
## Coefficients:
##          ar1      ma1      ma2     sma1
##       0.2614  -0.6237  -0.1694  -0.7739
## s.e.  0.1692   0.1669   0.1045   0.0580
## 
## sigma^2 = 1254:  log likelihood = -1734.05
## AIC=3478.09   AICc=3478.27   BIC=3497.34
## 
## Training set error measures:
##                     ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -1.131123 34.57228 19.40483 -1.590025 9.119137 0.6831079
##                      ACF1
## Training set 0.0006896171
tsplot(mod9B$residuals) #fairly okay

adf.test(mod9B$residuals) #stationary
## Warning in adf.test(mod9B$residuals): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  mod9B$residuals
## Dickey-Fuller = -6.5631, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(mod9B$residuals) #stationary
## Warning in pp.test(mod9B$residuals): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  mod9B$residuals
## Dickey-Fuller Z(alpha) = -356.51, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
# KPSS p = 0.022 < 0.05: rejects the stationarity null -> this model fails KPSS
kpss.test(mod9B$residuals) #0.02246, failed
## 
##  KPSS Test for Level Stationarity
## 
## data:  mod9B$residuals
## KPSS Level = 0.60193, Truncation lag parameter = 5, p-value = 0.02246
summary(ur.ers(mod9B$residuals)) #stationary
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -104.32  -15.66    0.82   14.12  467.89 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -0.97879    0.11986  -8.166 5.85e-15 ***
## yd.diff.lag1 -0.01544    0.10871  -0.142    0.887    
## yd.diff.lag2 -0.01977    0.09379  -0.211    0.833    
## yd.diff.lag3 -0.02640    0.07638  -0.346    0.730    
## yd.diff.lag4  0.05050    0.05403   0.935    0.351    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34.9 on 350 degrees of freedom
## Multiple R-squared:  0.5005, Adjusted R-squared:  0.4934 
## F-statistic: 70.14 on 5 and 350 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -8.1661 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
acf(mod9B$residuals, lag=100) #looks okay

pacf(mod9B$residuals, lag=100) #looks okay

#add SMA(1)
mod10B=Arima(blackHC$hate_crime,order=c(0,1,2), seasonal=list(order=c(0,1,2),period=12), include.drift =FALSE)
summary(mod10B) #AIC=3480.16   AICc=3480.33   BIC=3499.4, got worse
## Series: blackHC$hate_crime 
## ARIMA(0,1,2)(0,1,2)[12] 
## 
## Coefficients:
##           ma1      ma2     sma1    sma2
##       -0.3890  -0.2888  -0.8128  0.0452
## s.e.   0.0499   0.0497   0.0909  0.0899
## 
## sigma^2 = 1262:  log likelihood = -1735.08
## AIC=3480.16   AICc=3480.33   BIC=3499.4
## 
## Training set error measures:
##                     ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -1.032989 34.67153 19.34546 -1.549093 9.099475 0.6810178
##                    ACF1
## Training set 0.02453064
#4. fit auto.arima for comparison; it selects a non-seasonal ARIMA(2,1,2) (see output below)
mod11B=auto.arima(blackHC$hate_crime)
summary(mod11B) #(2,1,2), AIC=3664.08   AICc=3664.25   BIC=3683.49
## Series: blackHC$hate_crime 
## ARIMA(2,1,2) 
## 
## Coefficients:
##          ar1      ar2      ma1     ma2
##       1.5493  -0.7173  -1.8274  0.8925
## s.e.  0.0742   0.0549   0.0705  0.0668
## 
## sigma^2 = 1553:  log likelihood = -1827.04
## AIC=3664.08   AICc=3664.25   BIC=3683.49
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 1.761113 39.12775 25.68421 -1.255502 12.53004 0.9041606
##                     ACF1
## Training set -0.03501769
# Residual diagnostics for the auto.arima model:
# ADF/PP (null = unit root), KPSS (null = stationary), ERS (DF-GLS).
tsplot(mod11B$residuals) #fairly okay

adf.test(mod11B$residuals) #stationary
## Warning in adf.test(mod11B$residuals): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  mod11B$residuals
## Dickey-Fuller = -6.995, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(mod11B$residuals) #stationary
## Warning in pp.test(mod11B$residuals): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  mod11B$residuals
## Dickey-Fuller Z(alpha) = -377.87, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(mod11B$residuals) #stationary
## Warning in kpss.test(mod11B$residuals): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  mod11B$residuals
## KPSS Level = 0.12662, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(mod11B$residuals)) #stationary
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -96.39 -18.44   1.32  21.63 461.92 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -0.94634    0.12071  -7.840 5.49e-14 ***
## yd.diff.lag1 -0.08654    0.10956  -0.790    0.430    
## yd.diff.lag2 -0.10086    0.09547  -1.056    0.291    
## yd.diff.lag3 -0.07328    0.07746  -0.946    0.345    
## yd.diff.lag4 -0.01334    0.05390  -0.247    0.805    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39.57 on 350 degrees of freedom
## Multiple R-squared:  0.5154, Adjusted R-squared:  0.5085 
## F-statistic: 74.45 on 5 and 350 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -7.8398 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
acf(mod11B$residuals, lag=100) #some seasonal pattern remains

pacf(mod11B$residuals, lag=100) #some seasonal lag, not good

#comparing the 6 candidate SARIMA models, the best (lowest AIC/AICc/BIC) is SARIMA(0,1,2)(0,1,1)[12]
mod7B=Arima(blackHC$hate_crime,order=c(0,1,2), seasonal=list(order=c(0,1,1),period=12), include.drift =FALSE)
summary(mod7B) #AIC=3478.41   AICc=3478.53   BIC=3493.81
## Series: blackHC$hate_crime 
## ARIMA(0,1,2)(0,1,1)[12] 
## 
## Coefficients:
##           ma1      ma2     sma1
##       -0.3883  -0.2895  -0.7777
## s.e.   0.0498   0.0497   0.0571
## 
## sigma^2 = 1259:  log likelihood = -1735.2
## AIC=3478.41   AICc=3478.53   BIC=3493.81
## 
## Training set error measures:
##                     ME    RMSE      MAE       MPE     MAPE      MASE       ACF1
## Training set -1.088744 34.6865 19.38158 -1.575956 9.117224 0.6822895 0.02488984
# z-tests on the SARIMA coefficients: all three MA terms are significant.
coeftest(mod7B)
## 
## z test of coefficients:
## 
##       Estimate Std. Error  z value  Pr(>|z|)    
## ma1  -0.388306   0.049826  -7.7932 6.532e-15 ***
## ma2  -0.289509   0.049716  -5.8233 5.769e-09 ***
## sma1 -0.777704   0.057065 -13.6285 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Squared correlation of fitted vs. observed values as a pseudo-R^2 for the fit.
cor(fitted(mod7B), blackHC$hate_crime)^2 #0.6875213
## [1] 0.6875213
#check normality of residuals (needed for trustworthy forecast intervals)
qqPlot(mod7B$residuals) #fairly normal but some outliers

## [1] 354  37
densityplot(as.numeric(mod7B$residuals)) #some outliers
 
shapiro.test(mod7B$residuals) #not normal, we can't trust forecast interval unless dropping outliers and test them again
## 
##  Shapiro-Wilk normality test
## 
## data:  mod7B$residuals
## W = 0.63638, p-value < 2.2e-16

The best model for the anti-Black series is SARIMA(0,1,2)×(0,1,1)[12] (mod7B), selected by AIC/AICc/BIC among the candidate models.

Compare function of time model and SARIMA model

# Compare the best function-of-time model against the best SARIMA model,
# first in-sample (AIC/BIC) and then out-of-sample on the last 12 months.
t=1:360
#best function of time model: seasonal model (month dummies, no intercept)
mod5B=lm(hate_crime~t+as.factor(month)+0, data=blackHC)
#summary(mod5B) #Adjusted R-squared:  0.9388 
AIC(mod5B) #3902.605
## [1] 3902.605
BIC(mod5B) #3957.01
## [1] 3957.01
#best SARIMA model
mod7B=Arima(blackHC$hate_crime,order=c(0,1,2), seasonal=list(order=c(0,1,1),period=12), include.drift =FALSE)
#summary(mod7B) #AIC=3478.41   AICc=3478.53   BIC=3493.81

#compare forecasting performance of two models
b=blackHC[1:348,] #training set: first 348 months (1991-2019); hold out the final 12 months (2020)
t=1:348 #NOTE: t is reassigned here to match the shortened training set
foftimeModB = lm(hate_crime~t+as.factor(month)+0, data=b)
AIC(foftimeModB) #3584.569
## [1] 3584.569
BIC(foftimeModB) #3638.5
## [1] 3638.5
foftimePredB= predict(foftimeModB, data.frame(t=349:360, month=1:12))

sarimaModB=Arima(b$hate_crime,order=c(0,1,2), seasonal=list(order=c(0,1,1),period=12), include.drift = FALSE) 
#summary(sarimaModB) #AIC=3072.28   AICc=3072.4   BIC=3087.53
sarimaBPred=forecast(sarimaModB,h=12)$mean

#real data for the held-out 12 months
real_value2=blackHC$hate_crime[c(349:360)]

#compare predictions with real data (squared correlation)
cor(real_value2,foftimePredB)^2 # R^2 = 0.2092973
## [1] 0.2092973
cor(real_value2,sarimaBPred)^2 # R^2 = 0.3096983, this has better prediction
## [1] 0.3096983

The SARIMA model gives the better out-of-sample prediction (R² = 0.31 vs. 0.21 for the function-of-time model), so we use it for forecasting.

Prediction using the best SARIMA model

#plot prediction: forecast 24 months ahead from the best SARIMA model (mod7B)
pred_blackHC=forecast(mod7B,h=24) #two years, 2021 and 2022
pred_blackHC
##     Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 361       225.7696 180.2951 271.2442 156.2223 295.3170
## 362       262.9636 209.6561 316.2712 181.4367 344.4905
## 363       274.7727 219.4884 330.0570 190.2226 359.3227
## 364       279.5197 222.3269 336.7125 192.0509 366.9885
## 365       307.4694 248.4298 366.5090 217.1762 397.7627
## 366       406.3728 345.5424 467.2031 313.3408 499.4048
## 367       367.2499 304.6801 429.8198 271.5576 462.9423
## 368       352.8507 288.5883 417.1130 254.5699 451.1314
## 369       327.7986 261.8872 393.7099 226.9959 428.6013
## 370       326.3496 258.8295 393.8697 223.0865 429.6127
## 371       296.7755 227.6841 365.8669 191.1094 402.4417
## 372       260.7437 190.1159 331.3715 152.7279 368.7595
## 373       270.0399 195.1978 344.8820 155.5787 384.5010
## 374       288.0078 210.3198 365.6958 169.1943 406.8214
## 375       299.8169 220.0916 379.5422 177.8875 421.7463
## 376       304.5639 222.8521 386.2758 179.5964 429.5314
## 377       332.5136 248.8624 416.1649 204.5801 460.4471
## 378       431.4170 345.8703 516.9636 300.5847 562.2493
## 379       392.2941 304.8932 479.6951 258.6259 525.9624
## 380       377.8949 288.6781 467.1116 241.4496 514.3401
## 381       352.8428 261.8464 443.8391 213.6759 492.0097
## 382       351.3938 258.6520 444.1355 209.5575 493.2301
## 383       321.8198 227.3648 416.2747 177.3634 466.2761
## 384       285.7879 189.6503 381.9255 138.7582 432.8177
plot(pred_blackHC, main="Prediction on anti-African American or Black hate crimes", xlab="Time", ylab="Hate crimes")

# Same SARIMA(0,1,2)(0,1,1)[12] forecast via sarima.for as a cross-check;
# the point forecasts below match those from forecast() above.
forecasting3<-sarima.for(blackHC$hate_crime, 0,1,2, 0,1,1, 12, n.ahead = 24)

forecasting3$pred
## Time Series:
## Start = 361 
## End = 384 
## Frequency = 1 
##  [1] 225.7696 262.9636 274.7727 279.5197 307.4694 406.3728 367.2499 352.8507
##  [9] 327.7986 326.3496 296.7755 260.7437 270.0399 288.0078 299.8169 304.5639
## [17] 332.5136 431.4170 392.2941 377.8949 352.8428 351.3938 321.8198 285.7879

3. Anti-Asian hate crimes

Get time series plots for anti-Asian hate crimes

summary(asianHC)
##       year          month         hate_crime      difference       
##  Min.   :1991   Min.   : 1.00   Min.   : 3.00   Min.   :-26.00000  
##  1st Qu.:1998   1st Qu.: 3.75   1st Qu.:12.00   1st Qu.: -4.00000  
##  Median :2006   Median : 6.50   Median :17.00   Median :  0.00000  
##  Mean   :2006   Mean   : 6.50   Mean   :17.84   Mean   :  0.01944  
##  3rd Qu.:2013   3rd Qu.: 9.25   3rd Qu.:23.00   3rd Qu.:  4.00000  
##  Max.   :2020   Max.   :12.00   Max.   :51.00   Max.   : 39.00000
# Build monthly xts objects: ts5 uses a POSIXct index, ts6 a yearmon index (used for plotting).
ts5<- xts(asianHC$hate_crime, as.POSIXct(sprintf("%d-%d-01", asianHC$year, asianHC$month)))
ts6 <- xts(asianHC$hate_crime, as.yearmon(asianHC$year + (asianHC$month-1)/12))

plot(ts6, main="Anti-Asian Hate Crimes")

# Year-by-year (12-month window) plots, kept commented out for reference:
#par(mfrow=1:2)
#for (i in 1:30){
#  start=12*(i-1)+1
#  end=12*i
#  print(plot(ts6[start:end]))
#}

Fit function of time model

#start with a linear trend model: hate_crime as a linear function of the time index t
t=1:360
mod1A=lm(hate_crime~t, data=asianHC)
summary(mod1A) #Adjusted R-squared: 0.245 
## 
## Call:
## lm(formula = hate_crime ~ t, data = asianHC)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.500  -4.766  -1.392   3.695  39.923 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 25.008651   0.762691   32.79   <2e-16 ***
## t           -0.039691   0.003662  -10.84   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.22 on 358 degrees of freedom
## Multiple R-squared:  0.2471, Adjusted R-squared:  0.245 
## F-statistic: 117.5 on 1 and 358 DF,  p-value: < 2.2e-16
#checking stationarity of the residuals
plot(mod1A$residuals) #doesn't look stationary, fit quadratic trend
lines(lowess(mod1A$residuals))

#quadratic model: add a t^2 term to the linear trend
mod2A=lm(hate_crime~t+I(t^2), data=asianHC)
summary(mod2A) #Adjusted R-squared:  0.2607
## 
## Call:
## lm(formula = hate_crime ~ t + I(t^2), data = asianHC)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.419  -4.548  -1.182   3.317  37.833 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.750e+01  1.136e+00  24.208  < 2e-16 ***
## t           -8.099e-02  1.453e-02  -5.573 4.94e-08 ***
## I(t^2)       1.144e-04  3.898e-05   2.934  0.00356 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.145 on 357 degrees of freedom
## Multiple R-squared:  0.2648, Adjusted R-squared:  0.2607 
## F-statistic:  64.3 on 2 and 357 DF,  p-value: < 2.2e-16
#tsplot(mod2B$residuals) #didn't change much from the original tsplot
plot(mod2A$residuals) #doesn't look stationary
lines(lowess(mod2A$residuals))

#cosine model: add period-12 harmonic terms to capture annual seasonality
asianHC$Xcos=cos(2*pi*t/12)
asianHC$Xsin=sin(2*pi*t/12)

mod3A=lm(hate_crime~t+I(t^2)+Xcos+Xsin,data=asianHC)
summary(mod3A) #Adjusted R-squared:  0.2773 , improved. sin term not sig.
## 
## Call:
## lm(formula = hate_crime ~ t + I(t^2) + Xcos + Xsin, data = asianHC)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.742  -4.693  -1.174   3.722  38.063 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.750e+01  1.123e+00  24.479  < 2e-16 ***
## t           -8.100e-02  1.437e-02  -5.637 3.52e-08 ***
## I(t^2)       1.145e-04  3.854e-05   2.971  0.00317 ** 
## Xcos        -1.666e+00  5.265e-01  -3.164  0.00169 ** 
## Xsin        -2.391e-01  5.267e-01  -0.454  0.65009    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.064 on 355 degrees of freedom
## Multiple R-squared:  0.2854, Adjusted R-squared:  0.2773 
## F-statistic: 35.44 on 4 and 355 DF,  p-value: < 2.2e-16
tsplot(mod3A$residuals) 

plot(mod3A$residuals)
lines(lowess(mod3A$residuals))#better than the two models above

#cosine model, dropping the non-significant sine term
mod4A=lm(hate_crime~t+I(t^2)+Xcos,data=asianHC)
summary(mod4A) #Adjusted R-squared:  0.2789 got slightly better
## 
## Call:
## lm(formula = hate_crime ~ t + I(t^2) + Xcos, data = asianHC)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.735  -4.699  -1.162   3.694  37.817 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 27.4890270  1.1219142  24.502  < 2e-16 ***
## t           -0.0809557  0.0143517  -5.641 3.45e-08 ***
## I(t^2)       0.0001145  0.0000385   2.975  0.00313 ** 
## Xcos        -1.6659519  0.5259498  -3.168  0.00167 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.056 on 356 degrees of freedom
## Multiple R-squared:  0.285,  Adjusted R-squared:  0.2789 
## F-statistic: 47.29 on 3 and 356 DF,  p-value: < 2.2e-16
# Residual checks for the reduced cosine model.
tsplot(mod4A$residuals) #looks like it got rid of seasonal trend, but still not stationary

plot(mod4A$residuals)
lines(lowess(mod4A$residuals))#didn't get better

#seasonal model: month dummies plus linear trend, intercept suppressed
#(same specification as mod5B for the Black series).
#Fix: the original formula was hate_crime~0+t+as.factor(month)+0 — the
#intercept was suppressed twice; a single +0 fits the identical model.
mod5A=lm(hate_crime~t+as.factor(month)+0, data=asianHC)
summary(mod5A) #Adjusted R-squared:  0.8687 
## 
## Call:
## lm(formula = hate_crime ~ 0 + t + as.factor(month) + 0, data = asianHC)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.096  -4.653  -1.517   3.939  38.639 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## t                  -0.039686   0.003619  -10.97   <2e-16 ***
## as.factor(month)1  22.778396   1.447780   15.73   <2e-16 ***
## as.factor(month)2  22.451415   1.449366   15.49   <2e-16 ***
## as.factor(month)3  26.291101   1.450960   18.12   <2e-16 ***
## as.factor(month)4  26.597454   1.452561   18.31   <2e-16 ***
## as.factor(month)5  27.037140   1.454169   18.59   <2e-16 ***
## as.factor(month)6  25.810159   1.455784   17.73   <2e-16 ***
## as.factor(month)7  25.883179   1.457407   17.76   <2e-16 ***
## as.factor(month)8  24.722865   1.459036   16.95   <2e-16 ***
## as.factor(month)9  25.529218   1.460673   17.48   <2e-16 ***
## as.factor(month)10 26.835570   1.462317   18.35   <2e-16 ***
## as.factor(month)11 23.975256   1.463968   16.38   <2e-16 ***
## as.factor(month)12 22.181609   1.465627   15.13   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.131 on 347 degrees of freedom
## Multiple R-squared:  0.8734, Adjusted R-squared:  0.8687 
## F-statistic: 184.2 on 13 and 347 DF,  p-value: < 2.2e-16
# Residual checks for the seasonal model.
tsplot(mod5A$residuals) 

plot(mod5A$residuals)
lines(lowess(mod5A$residuals))#still not good; rather than detrending, differencing is needed to make the series stationary

None of the function-of-time models resolves the nonstationarity observed in the residuals, so we cannot use them for prediction; differencing within a SARIMA model is needed instead.

Check stationarity of the data and transform data for SARIMA model fitting

# Stationarity tests on the raw anti-Asian series:
# ADF/PP (null = unit root) vs. KPSS (null = stationary) vs. ERS (DF-GLS).
tsplot(asianHC$hate_crime) #doesn't look stationary

adf.test(asianHC$hate_crime) #fail
## 
##  Augmented Dickey-Fuller Test
## 
## data:  asianHC$hate_crime
## Dickey-Fuller = -3.3697, Lag order = 7, p-value = 0.05946
## alternative hypothesis: stationary
pp.test(asianHC$hate_crime)
## Warning in pp.test(asianHC$hate_crime): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  asianHC$hate_crime
## Dickey-Fuller Z(alpha) = -183.41, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(asianHC$hate_crime) #fail
## Warning in kpss.test(asianHC$hate_crime): p-value smaller than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  asianHC$hate_crime
## KPSS Level = 2.9982, Truncation lag parameter = 5, p-value = 0.01
summary(ur.ers(asianHC$hate_crime))
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.292  -3.265   0.156   3.992  36.536 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -0.07523    0.03104  -2.424  0.01586 *  
## yd.diff.lag1 -0.50220    0.05770  -8.704  < 2e-16 ***
## yd.diff.lag2 -0.34276    0.06209  -5.521 6.59e-08 ***
## yd.diff.lag3 -0.18076    0.06059  -2.984  0.00305 ** 
## yd.diff.lag4 -0.09133    0.05309  -1.720  0.08625 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.023 on 350 degrees of freedom
## Multiple R-squared:  0.2575, Adjusted R-squared:  0.2469 
## F-statistic: 24.27 on 5 and 350 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -2.4239 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
acf(asianHC$hate_crime, lag=100) #slow ACF decay: not stationary, regular differencing needed

# First difference of the series.
tsplot(diff(asianHC$hate_crime)) #looks better but some concern with variance

adf.test(diff(asianHC$hate_crime)) 
## Warning in adf.test(diff(asianHC$hate_crime)): p-value smaller than printed p-
## value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff(asianHC$hate_crime)
## Dickey-Fuller = -10.455, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
# The differenced series passes all four stationarity tests below.
pp.test(diff(asianHC$hate_crime))
## Warning in pp.test(diff(asianHC$hate_crime)): p-value smaller than printed p-
## value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  diff(asianHC$hate_crime)
## Dickey-Fuller Z(alpha) = -407.93, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(diff(asianHC$hate_crime)) 
## Warning in kpss.test(diff(asianHC$hate_crime)): p-value greater than printed p-
## value
## 
##  KPSS Test for Level Stationarity
## 
## data:  diff(asianHC$hate_crime)
## KPSS Level = 0.028332, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(diff(asianHC$hate_crime))) #pass all
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -21.207  -3.989  -0.429   3.231  36.600 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -2.65476    0.20266 -13.099  < 2e-16 ***
## yd.diff.lag1  1.07717    0.17667   6.097 2.86e-09 ***
## yd.diff.lag2  0.64847    0.14000   4.632 5.12e-06 ***
## yd.diff.lag3  0.36672    0.09704   3.779 0.000185 ***
## yd.diff.lag4  0.16308    0.05257   3.102 0.002078 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.988 on 349 degrees of freedom
## Multiple R-squared:  0.7367, Adjusted R-squared:  0.733 
## F-statistic: 195.3 on 5 and 349 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -13.0994 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
acf(diff(asianHC$hate_crime), lag=100) #some seasonal autocorrelation remains, but better than the undifferenced series

Fit appropriate SARIMA model

#asian hate crime: identify SARIMA orders from the ACF/PACF of the differenced series
tsplot(diff(asianHC$hate_crime))

acf(diff(asianHC$hate_crime), lag=50) #lag 1, MA(1)

pacf(diff(asianHC$hate_crime), lag=50) #cut off at 12, SAR(1)

#1. start with SAR(1) included
mod6A=Arima(asianHC$hate_crime,order=c(0,1,0), seasonal=list(order=c(1,0,0),period=12), include.drift = TRUE) #has warning, can't include drift if we do 2 differencing
summary(mod6A) #AIC=2411.86   AICc=2411.93   BIC=2423.51
## Series: asianHC$hate_crime 
## ARIMA(0,1,0)(1,0,0)[12] with drift 
## 
## Coefficients:
##         sar1   drift
##       0.0920  0.0210
## s.e.  0.0557  0.3998
## 
## sigma^2 = 47.89:  log likelihood = -1202.93
## AIC=2411.86   AICc=2411.93   BIC=2423.51
## 
## Training set error measures:
##                        ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -0.002045848 6.891671 5.126097 -8.685741 33.34394 0.9909902
##                    ACF1
## Training set -0.3949622
coeftest(mod6A) #drift not sig.
## 
## z test of coefficients:
## 
##       Estimate Std. Error z value Pr(>|z|)  
## sar1  0.092017   0.055728  1.6512  0.09871 .
## drift 0.021049   0.399796  0.0526  0.95801  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Refit without the non-significant drift term.
mod6.2A=Arima(asianHC$hate_crime,order=c(0,1,0),seasonal=list(order=c(1,0,0),period=12), include.drift =FALSE)
summary(mod6.2A) #AIC=2409.87   AICc=2409.9   BIC=2417.63 #better 
## Series: asianHC$hate_crime 
## ARIMA(0,1,0)(1,0,0)[12] 
## 
## Coefficients:
##         sar1
##       0.0920
## s.e.  0.0557
## 
## sigma^2 = 47.76:  log likelihood = -1202.93
## AIC=2409.87   AICc=2409.9   BIC=2417.63
## 
## Training set error measures:
##                      ME     RMSE      MAE      MPE     MAPE      MASE
## Training set 0.01707476 6.891697 5.125928 -8.55184 33.31935 0.9909575
##                    ACF1
## Training set -0.3949598
# Residual diagnostics for mod6.2A: ADF/PP (null = unit root), KPSS (null = stationary), ERS.
tsplot(mod6.2A$residuals) #fairly okay, one outlier

adf.test(mod6.2A$residuals) #stationary
## Warning in adf.test(mod6.2A$residuals): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  mod6.2A$residuals
## Dickey-Fuller = -10.103, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(mod6.2A$residuals) #stationary
## Warning in pp.test(mod6.2A$residuals): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  mod6.2A$residuals
## Dickey-Fuller Z(alpha) = -412.22, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(mod6.2A$residuals) #stationary
## Warning in kpss.test(mod6.2A$residuals): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  mod6.2A$residuals
## KPSS Level = 0.024991, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(mod6.2A$residuals)) #stationary
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -21.844  -3.898  -0.273   3.143  36.224 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -2.60904    0.20230 -12.897  < 2e-16 ***
## yd.diff.lag1  1.03044    0.17641   5.841 1.19e-08 ***
## yd.diff.lag2  0.61514    0.14000   4.394 1.48e-05 ***
## yd.diff.lag3  0.34612    0.09728   3.558 0.000425 ***
## yd.diff.lag4  0.15670    0.05264   2.977 0.003117 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.971 on 350 degrees of freedom
## Multiple R-squared:  0.7371, Adjusted R-squared:  0.7333 
## F-statistic: 196.3 on 5 and 350 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -12.8966 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
acf(mod6.2A$residuals, lag=50) #lag at 1, MA(1) more obvious

pacf(mod6.2A$residuals, lag=50) #lags 1, 2, 3, 5 significant, try AR(1)

#2. add MA(1): SARIMA(0,1,1)(1,0,0)[12]
mod7A=Arima(asianHC$hate_crime,order=c(0,1,1), seasonal=list(order=c(1,0,0),period=12), include.drift =FALSE)
summary(mod7A) #AIC=2301.27   AICc=2301.34   BIC=2312.92 #better
## Series: asianHC$hate_crime 
## ARIMA(0,1,1)(1,0,0)[12] 
## 
## Coefficients:
##           ma1    sar1
##       -0.7400  0.1081
## s.e.   0.0615  0.0603
## 
## sigma^2 = 35.11:  log likelihood = -1147.63
## AIC=2301.27   AICc=2301.34   BIC=2312.92
## 
## Training set error measures:
##                    ME     RMSE      MAE      MPE     MAPE      MASE      ACF1
## Training set 0.105185 5.901044 4.425677 -9.26446 28.25707 0.8555833 0.1232071
# Residual diagnostics for mod7A: ADF/PP (null = unit root), KPSS (null = stationary), ERS.
tsplot(mod7A$residuals) #fairly okay

adf.test(mod7A$residuals) #stationary
## Warning in adf.test(mod7A$residuals): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  mod7A$residuals
## Dickey-Fuller = -8.3069, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(mod7A$residuals) #stationary
## Warning in pp.test(mod7A$residuals): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  mod7A$residuals
## Dickey-Fuller Z(alpha) = -297.59, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(mod7A$residuals) #stationary
## Warning in kpss.test(mod7A$residuals): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  mod7A$residuals
## KPSS Level = 0.072445, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(mod7A$residuals)) #stationary
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.627  -3.459  -0.044   3.320  36.058 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -1.05619    0.11235  -9.401   <2e-16 ***
## yd.diff.lag1  0.17189    0.09949   1.728   0.0849 .  
## yd.diff.lag2  0.14471    0.08643   1.674   0.0950 .  
## yd.diff.lag3  0.13784    0.07145   1.929   0.0545 .  
## yd.diff.lag4  0.09184    0.05367   1.711   0.0879 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.87 on 350 degrees of freedom
## Multiple R-squared:  0.4462, Adjusted R-squared:  0.4383 
## F-statistic: 56.39 on 5 and 350 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -9.4008 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
acf(mod7A$residuals, lag=50) #small lag at 1.

pacf(mod7A$residuals, lag=50) #lag 1 sig. AR(1)

#3. add AR(1): SARIMA(1,1,1)(1,0,0)[12]
mod8A=Arima(asianHC$hate_crime,order=c(1,1,1), seasonal=list(order=c(1,0,0),period=12), include.drift =FALSE)
summary(mod8A) #AIC=2289.04   AICc=2289.16   BIC=2304.58 #better
## Series: asianHC$hate_crime 
## ARIMA(1,1,1)(1,0,0)[12] 
## 
## Coefficients:
##          ar1      ma1    sar1
##       0.2633  -0.8919  0.1089
## s.e.  0.0634   0.0310  0.0596
## 
## sigma^2 = 33.82:  log likelihood = -1140.52
## AIC=2289.04   AICc=2289.16   BIC=2304.58
## 
## Training set error measures:
##                     ME  RMSE     MAE       MPE     MAPE      MASE        ACF1
## Training set 0.1467428 5.783 4.32512 -9.216289 27.78039 0.8361433 -0.01144035
# Residual diagnostics for mod8A: ADF/PP (null = unit root), KPSS (null = stationary), ERS.
tsplot(mod8A$residuals) #fairly okay

adf.test(mod8A$residuals) #stationary
## Warning in adf.test(mod8A$residuals): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  mod8A$residuals
## Dickey-Fuller = -7.0533, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(mod8A$residuals) #stationary
## Warning in pp.test(mod8A$residuals): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  mod8A$residuals
## Dickey-Fuller Z(alpha) = -383.84, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(mod8A$residuals) #stationary
## Warning in kpss.test(mod8A$residuals): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  mod8A$residuals
## KPSS Level = 0.18526, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(mod8A$residuals)) #stationary
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.280  -3.612  -0.159   3.336  36.297 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -0.93726    0.11441  -8.192 4.87e-15 ***
## yd.diff.lag1 -0.08013    0.10400  -0.770    0.442    
## yd.diff.lag2 -0.03269    0.09208  -0.355    0.723    
## yd.diff.lag3  0.03389    0.07614   0.445    0.656    
## yd.diff.lag4  0.05033    0.05340   0.943    0.347    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.809 on 350 degrees of freedom
## Multiple R-squared:  0.5114, Adjusted R-squared:  0.5044 
## F-statistic: 73.26 on 5 and 350 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -8.1922 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
acf(mod8A$residuals, lag=50) #looks okay

pacf(mod8A$residuals, lag=50) #looks okay

#4. auto.arima for comparison (it selects a non-seasonal ARIMA(2,1,2); see output below)
mod9A=auto.arima(asianHC$hate_crime)
summary(mod9A) #(2,1,2), AIC=2293.57   AICc=2293.74   BIC=2312.99 not better
## Series: asianHC$hate_crime 
## ARIMA(2,1,2) 
## 
## Coefficients:
##          ar1      ar2      ma1     ma2
##       0.5687  -0.0288  -1.2067  0.2743
## s.e.  0.5477   0.1608   0.5446  0.4839
## 
## sigma^2 = 34.17:  log likelihood = -1141.78
## AIC=2293.57   AICc=2293.74   BIC=2312.99
## 
## Training set error measures:
##                     ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 0.1325418 5.804518 4.353629 -9.451927 28.04619 0.8416546
##                     ACF1
## Training set 0.001352735
tsplot(mod9A$residuals)

acf(mod9A$residuals)

pacf(mod9A$residuals) #looks okay but AIC and BIC are better in mod8A

#5. add extra term and see if model gets better
#add SMA(1): SARIMA(1,1,1)(1,0,1)[12]
mod10A=Arima(asianHC$hate_crime,order=c(1,1,1), seasonal=list(order=c(1,0,1),period=12), include.drift =FALSE)
coeftest(mod10A) #all terms sig.
## 
## z test of coefficients:
## 
##       Estimate Std. Error  z value  Pr(>|z|)    
## ar1   0.237805   0.066895   3.5549 0.0003781 ***
## ma1  -0.879696   0.034709 -25.3447 < 2.2e-16 ***
## sar1  0.910623   0.099283   9.1720 < 2.2e-16 ***
## sma1 -0.837127   0.131801  -6.3514 2.133e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(mod10A) #AIC=2285.7   AICc=2285.87   BIC=2305.12 #AIC got better, BIC got worse, since we aim to do prediction, select model with better AIC.
## Series: asianHC$hate_crime 
## ARIMA(1,1,1)(1,0,1)[12] 
## 
## Coefficients:
##          ar1      ma1    sar1     sma1
##       0.2378  -0.8797  0.9106  -0.8371
## s.e.  0.0669   0.0347  0.0993   0.1318
## 
## sigma^2 = 33.32:  log likelihood = -1137.85
## AIC=2285.7   AICc=2285.87   BIC=2305.12
## 
## Training set error measures:
##                     ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 0.1810031 5.731914 4.269975 -8.476687 27.25652 0.8254825
##                      ACF1
## Training set -0.009022443
#Residual diagnostics: if mod10A has captured the structure, the residuals
#should be stationary and white-noise-like. Four unit-root/stationarity
#tests applied below (ADF, PP, KPSS, ERS DF-GLS); all agree on stationarity.
tsplot(mod10A$residuals) #fairly okay

adf.test(mod10A$residuals) #stationary
## Warning in adf.test(mod10A$residuals): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  mod10A$residuals
## Dickey-Fuller = -7.0277, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(mod10A$residuals) #stationary
## Warning in pp.test(mod10A$residuals): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  mod10A$residuals
## Dickey-Fuller Z(alpha) = -380.79, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
#KPSS has the reverse null (stationarity), so a LARGE p-value supports it
kpss.test(mod10A$residuals) #stationary
## Warning in kpss.test(mod10A$residuals): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  mod10A$residuals
## KPSS Level = 0.18343, Truncation lag parameter = 5, p-value = 0.1
#ERS DF-GLS: test statistic (-8.14) is far below the 1% critical value (-2.57)
summary(ur.ers(mod10A$residuals)) #stationary
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.553  -3.533  -0.251   3.305  36.495 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -0.93549    0.11487  -8.144 6.81e-15 ***
## yd.diff.lag1 -0.07901    0.10446  -0.756    0.450    
## yd.diff.lag2 -0.04352    0.09237  -0.471    0.638    
## yd.diff.lag3  0.02606    0.07612   0.342    0.732    
## yd.diff.lag4  0.04763    0.05344   0.891    0.373    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.759 on 350 degrees of freedom
## Multiple R-squared:  0.5097, Adjusted R-squared:  0.5027 
## F-statistic: 72.77 on 5 and 350 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -8.1442 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
# ACF/PACF of the mod10A residuals out to lag 50: no significant remaining
# autocorrelation, confirming the SARIMA(1,1,1)(1,0,1)[12] fit is adequate.
# NOTE: the original used `lag=50`, silently relying on R's partial
# argument matching to hit acf()'s `lag.max` argument; spelled out here.
acf(mod10A$residuals, lag.max = 50) #looks okay

pacf(mod10A$residuals, lag.max = 50) #looks okay

#comparing 6 different SARIMA models, our best model is:
#SARIMA(1,1,1)x(1,0,1)[12] with no drift, refit here for the record
mod10A=Arima(asianHC$hate_crime,order=c(1,1,1), seasonal=list(order=c(1,0,1),period=12), include.drift =FALSE)
summary(mod10A) #AIC=2285.7   AICc=2285.87   BIC=2305.12 
## Series: asianHC$hate_crime 
## ARIMA(1,1,1)(1,0,1)[12] 
## 
## Coefficients:
##          ar1      ma1    sar1     sma1
##       0.2378  -0.8797  0.9106  -0.8371
## s.e.  0.0669   0.0347  0.0993   0.1318
## 
## sigma^2 = 33.32:  log likelihood = -1137.85
## AIC=2285.7   AICc=2285.87   BIC=2305.12
## 
## Training set error measures:
##                     ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 0.1810031 5.731914 4.269975 -8.476687 27.25652 0.8254825
##                      ACF1
## Training set -0.009022443
coeftest(mod10A)
## 
## z test of coefficients:
## 
##       Estimate Std. Error  z value  Pr(>|z|)    
## ar1   0.237805   0.066895   3.5549 0.0003781 ***
## ma1  -0.879696   0.034709 -25.3447 < 2.2e-16 ***
## sar1  0.910623   0.099283   9.1720 < 2.2e-16 ***
## sma1 -0.837127   0.131801  -6.3514 2.133e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#squared correlation of fitted vs observed: in-sample pseudo R^2
cor(fitted(mod10A), asianHC$hate_crime)^2 #0.52753
## [1] 0.52753
#check normality of residuals (needed for trustworthy forecast intervals)
qqPlot(mod10A$residuals) #fairly normal only several outliers

## [1] 351 352
densityplot(as.numeric(mod10A$residuals)) #some outliers
 
shapiro.test(mod10A$residuals) #not normal, we can't trust forecast interval unless dropping outliers and test them again
## 
##  Shapiro-Wilk normality test
## 
## data:  mod10A$residuals
## W = 0.94955, p-value = 9.472e-10

Best model is SARIMA(1,1,1)x(1,0,1)[12] (seasonal period 12).

Compare the function-of-time (regression) model with the SARIMA model.

#best function of time model: linear trend + monthly seasonal dummies
#(intercept suppressed so every month gets its own level)
t=seq_len(nrow(asianHC)) #time index; avoids hard-coding the series length (360)
mod5A=lm(hate_crime~0+t+as.factor(month), data=asianHC)
#summary(mod5A) #Adjusted R-squared:  0.8716 
AIC(mod5A) #2450.807 but residuals were not linear
## [1] 2450.807
BIC(mod5A) #2505.212
## [1] 2505.212
#best SARIMA model for the diff data: 
mod10A=Arima(asianHC$hate_crime,order=c(1,1,1), seasonal=list(order=c(1,0,1),period=12), include.drift =FALSE)
#summary(mod10A) #AIC=2285.7   AICc=2285.87   BIC=2305.12 

#compare forecasting performance of two models on a 2019-2020 holdout
#NOTE: the original assigned the training set to `c`, shadowing base c();
#renamed to train18 to avoid masking a core function.
train18=asianHC[1:348,] #including only through 2018
t=1:348
#function-of-time model: quadratic trend + monthly seasonal dummies
foftimeModA = lm(hate_crime~0+t+I(t^2)+as.factor(month), data=train18)
AIC(foftimeModA) #2260.843
## [1] 2260.843
BIC(foftimeModA) #2318.626
## [1] 2318.626
foftimePredA= predict(foftimeModA, data.frame(t=349:360, month=1:12))

#SARIMA model refit on the training window only
sarimaModA=Arima(train18$hate_crime,order=c(1,1,1), seasonal=list(order=c(1,0,1),period=12), include.drift = FALSE) 
#summary(sarimaModA) #AIC=2148.5   AICc=2148.67   BIC=2167.74
sarimaAPred=forecast(sarimaModA,h=12)$mean

#real data
real_value3=asianHC$hate_crime[349:360]

#compare with real data
cor(real_value3,foftimePredA)^2 # R^2 = 0.3780229
## [1] 0.3780229
cor(real_value3,sarimaAPred)^2 # R^2 =0.008795079, might be due to surge in 2020?
## [1] 0.008795079
#Robustness check: repeat the holdout comparison with an earlier split,
#training through 2017 and forecasting 2018-2019 (24 months).
train17=asianHC[1:324,] #including only through 2017
t=1:324
foftimeModA2 = lm(hate_crime~0+t+I(t^2)+as.factor(month), data=train17)
AIC(foftimeModA2) #2089.95
## [1] 2089.95
BIC(foftimeModA2) #2146.661
## [1] 2146.661
foftimePredA2= predict(foftimeModA2, data.frame(t=325:348, month=rep(1:12, 2))) #predict 2018,2019

sarimaModA2=Arima(train17$hate_crime,order=c(1,1,1), seasonal=list(order=c(1,0,1),period=12), include.drift = FALSE) 
#summary(sarimaModA2) #AIC=2013.28   AICc=2013.47   BIC=2032.16
sarimaAPred2=forecast(sarimaModA2,h=24)$mean
sarimaAPred2
## Time Series:
## Start = 325 
## End = 348 
## Frequency = 1 
##  [1] 10.824660 11.874799 10.727771 10.601041 11.583699 11.731505 10.057739
##  [8] 10.976842  9.944127 10.199161 10.890517 10.432550 10.139316 11.586934
## [15] 10.642463 10.541123 11.417726 11.549579 10.059426 10.877730  9.958281
## [22] 10.185344 10.800873 10.393135
#p=data.frame(sarimaAPred2)

#actual observations for the holdout window
real_value4=asianHC$hate_crime[325:348]

#out-of-sample squared correlation for each model
cor(real_value4,foftimePredA2)^2 # R^2 = 0.0540556
## [1] 0.0540556
cor(real_value4,sarimaAPred2)^2 # R^2 = 0.1645694
## [1] 0.1645694

SARIMA model is better.

Prediction using the best SARIMA model

#Forecast 24 months ahead (2021-2022) with the chosen SARIMA model and
#plot point forecasts with 80%/95% prediction intervals.
pred_asianHC=forecast(mod10A,h=24) #two years, 2021
pred_asianHC
##     Point Forecast    Lo 80    Hi 80     Lo 95    Hi 95
## 361       20.08305 12.68576 27.48035  8.769866 31.39624
## 362       21.98624 14.12892 29.84355  9.969509 34.00296
## 363       24.92636 16.92340 32.92933 12.686888 37.16584
## 364       25.14045 17.04025 33.24066 12.752258 37.52865
## 365       24.52689 16.34011 32.71368 12.006283 37.04750
## 366       24.51231 16.24201 32.78261 11.863982 37.16064
## 367       23.29950 14.94703 31.65197 10.525506 36.07349
## 368       22.74114 14.30742 31.17485  9.842882 35.63939
## 369       22.66157 14.14741 31.17573  9.640281 35.68285
## 370       22.86413 14.27029 31.45798  9.720978 36.00729
## 371       22.74483 14.07203 31.41763  9.480922 36.00873
## 372       22.65195 13.90091 31.40299  9.268392 36.03552
## 373       22.26226 13.34548 31.17905  8.625213 35.89931
## 374       23.01848 13.99824 32.03873  9.223204 36.81376
## 375       25.46353 16.35302 34.57404 11.530202 39.39686
## 376       25.60324 16.40607 34.80041 11.537384 39.66910
## 377       25.03138 15.74900 34.31376 10.835207 39.22756
## 378       25.01498 15.64831 34.38164 10.689898 39.34006
## 379       23.90982 14.45966 33.35999  9.457041 38.36260
## 380       23.40119 13.86826 32.93411  8.821835 37.98054
## 381       23.32869 13.71372 32.94366  8.623858 38.03352
## 382       23.51314 13.81682 33.20946  8.683894 38.34239
## 383       23.40449 13.62750 33.18149  8.451867 38.35712
## 384       23.31992 13.46291 33.17693  8.244924 38.39492
plot(pred_asianHC, main="Prediction on anti-Asian hate crimes", xlab="Time", ylab="Hate crimes")

#Cross-check: astsa's sarima.for() with the same SARIMA specification
#produces forecasts very close to the forecast() values above.
forecasting4<-sarima.for(asianHC$hate_crime, 1,1,1, 1,0,1, 12, n.ahead = 24)

forecasting4$pred
## Time Series:
## Start = 361 
## End = 384 
## Frequency = 1 
##  [1] 20.17948 22.12497 25.11162 25.34448 24.73837 24.73971 23.53464 22.98840
##  [9] 22.92469 23.14388 23.04077 22.96991 22.59723 23.37714 25.86200 26.02047
## [17] 25.45813 25.45884 24.36393 23.86925 23.81388 24.01604 23.92484 23.86301

An additional consideration: applying a log transformation to the anti-Asian hate crime series.

#Stationarity checks for the differenced log series: logging may stabilize
#the variance, and differencing should remove the trend.
tsplot(diff(log(asianHC$hate_crime))) #might fix the issue with variance

adf.test(diff(log(asianHC$hate_crime)))
## Warning in adf.test(diff(log(asianHC$hate_crime))): p-value smaller than printed
## p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff(log(asianHC$hate_crime))
## Dickey-Fuller = -10.644, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(diff(log(asianHC$hate_crime)))
## Warning in pp.test(diff(log(asianHC$hate_crime))): p-value smaller than printed
## p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  diff(log(asianHC$hate_crime))
## Dickey-Fuller Z(alpha) = -419.79, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
#KPSS has the reverse null (stationarity), so a LARGE p-value supports it
kpss.test(diff(log(asianHC$hate_crime))) 
## Warning in kpss.test(diff(log(asianHC$hate_crime))): p-value greater than
## printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  diff(log(asianHC$hate_crime))
## KPSS Level = 0.033609, Truncation lag parameter = 5, p-value = 0.1
#ERS DF-GLS: statistic (-13.6) is far below all critical values
summary(ur.ers(diff(log(asianHC$hate_crime)))) #pass all
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.39007 -0.21611  0.01356  0.22069  1.25530 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -2.90384    0.21353 -13.599  < 2e-16 ***
## yd.diff.lag1  1.23575    0.18625   6.635 1.24e-10 ***
## yd.diff.lag2  0.72880    0.14733   4.947 1.18e-06 ***
## yd.diff.lag3  0.43078    0.09986   4.314 2.09e-05 ***
## yd.diff.lag4  0.16984    0.05229   3.248  0.00127 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3372 on 349 degrees of freedom
## Multiple R-squared:  0.7667, Adjusted R-squared:  0.7634 
## F-statistic: 229.4 on 5 and 349 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -13.5994 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
acf(diff(log(asianHC$hate_crime)))